From 4316c15a8af0e2fe709d1f6048f47835f8929845 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 12 Jun 2026 02:42:16 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_MoTv00.02 Source: Original Platform --- .gitattributes | 36 + README.md | 59 + all_results.json | 8 + chat_template.jinja | 121 + config.json | 35 + generation_config.json | 8 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 10 + tokenizer.json | 3 + tokenizer_config.json | 2062 ++ train_results.json | 8 + trainer_state.json | 50587 +++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 53251 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs 
merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..a230d88 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/MoT_all_Llama3_8192toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_MoTv00.02 +tags: +- generated_from_trainer +- open-r1 +- trl +- sft +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_MoTv00.02 + +This model is a fine-tuned version of 
[meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/MoT_all_Llama3_8192toks](https://huggingface.co/datasets/Neelectric/MoT_all_Llama3_8192toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_MoTv00.02", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_mot/runs/yp3uk74y) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.28.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..f19afce --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 7.271307435875238e+19, + "train_loss": 0.7722587241576269, + "train_runtime": 61686.092, + "train_samples": 269513, + "train_samples_per_second": 8.738, + "train_steps_per_second": 0.091 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ 
b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think> +... +</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and 
tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git 
a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..8c67e18 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:215dec89eb13a895871f51c61438f8b649a7e521e6ba6ebff1e7a2c43a7b05e2 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..421025a --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d5cabdd3db575fedae860be59829c63fbe07fc065bb0e34ab6c1776e4a52001 +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..e29887b --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e20465fed1cfbcebaf671b57dc9331c17b1655bbdf9786dd4b91d5d56f921b0 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..30789b3 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04154bda4aec717a102304e50cdd916350fed6841b81d18a46d4fa601cc51a5c +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + 
"model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": 
"<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": 
"<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": 
"<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..f19afce --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 
7.271307435875238e+19, + "train_loss": 0.7722587241576269, + "train_runtime": 61686.092, + "train_samples": 269513, + "train_samples_per_second": 8.738, + "train_steps_per_second": 0.091 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..77187b6 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,50587 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 5616, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00035618878005342833, + "grad_norm": 4.320767879486084, + "learning_rate": 0.0, + "loss": 1.25, + "mean_token_accuracy": 0.6661594212055206, + "num_tokens": 291467.0, + "step": 1 + }, + { + "epoch": 0.0007123775601068567, + "grad_norm": 4.449665546417236, + "learning_rate": 1.779359430604982e-09, + "loss": 1.2097, + "mean_token_accuracy": 0.673173651099205, + "num_tokens": 553983.0, + "step": 2 + }, + { + "epoch": 0.0010685663401602849, + "grad_norm": 4.320713996887207, + "learning_rate": 3.558718861209964e-09, + "loss": 1.1851, + "mean_token_accuracy": 0.6830992251634598, + "num_tokens": 824570.0, + "step": 3 + }, + { + "epoch": 0.0014247551202137133, + "grad_norm": 4.1979780197143555, + "learning_rate": 5.338078291814947e-09, + "loss": 1.1511, + "mean_token_accuracy": 0.6842109560966492, + "num_tokens": 1116086.0, + "step": 4 + }, + { + "epoch": 0.0017809439002671415, + "grad_norm": 4.05550479888916, + "learning_rate": 7.117437722419928e-09, + "loss": 1.1538, + "mean_token_accuracy": 0.6872409135103226, + "num_tokens": 1409116.0, + "step": 5 + }, + { + "epoch": 0.0021371326803205698, + "grad_norm": 4.186526775360107, + "learning_rate": 8.896797153024911e-09, + "loss": 1.1699, + "mean_token_accuracy": 0.6806549578905106, + "num_tokens": 1702964.0, + "step": 6 + }, + { + "epoch": 0.002493321460373998, + "grad_norm": 
4.591403961181641, + "learning_rate": 1.0676156583629893e-08, + "loss": 1.2103, + "mean_token_accuracy": 0.672942578792572, + "num_tokens": 1960579.0, + "step": 7 + }, + { + "epoch": 0.0028495102404274266, + "grad_norm": 4.165136337280273, + "learning_rate": 1.2455516014234875e-08, + "loss": 1.1663, + "mean_token_accuracy": 0.6767878532409668, + "num_tokens": 2250401.0, + "step": 8 + }, + { + "epoch": 0.0032056990204808546, + "grad_norm": 4.326231002807617, + "learning_rate": 1.4234875444839856e-08, + "loss": 1.2331, + "mean_token_accuracy": 0.6585085391998291, + "num_tokens": 2535447.0, + "step": 9 + }, + { + "epoch": 0.003561887800534283, + "grad_norm": 4.355602741241455, + "learning_rate": 1.601423487544484e-08, + "loss": 1.2786, + "mean_token_accuracy": 0.6629510819911957, + "num_tokens": 2828727.0, + "step": 10 + }, + { + "epoch": 0.003918076580587711, + "grad_norm": 4.138643264770508, + "learning_rate": 1.7793594306049822e-08, + "loss": 1.1318, + "mean_token_accuracy": 0.6866544187068939, + "num_tokens": 3119882.0, + "step": 11 + }, + { + "epoch": 0.0042742653606411395, + "grad_norm": 4.394468307495117, + "learning_rate": 1.9572953736654804e-08, + "loss": 1.1974, + "mean_token_accuracy": 0.6755785197019577, + "num_tokens": 3397510.0, + "step": 12 + }, + { + "epoch": 0.004630454140694568, + "grad_norm": 4.313027381896973, + "learning_rate": 2.1352313167259786e-08, + "loss": 1.2476, + "mean_token_accuracy": 0.6647513210773468, + "num_tokens": 3695901.0, + "step": 13 + }, + { + "epoch": 0.004986642920747996, + "grad_norm": 4.13629674911499, + "learning_rate": 2.3131672597864765e-08, + "loss": 1.1541, + "mean_token_accuracy": 0.6917667835950851, + "num_tokens": 3981607.0, + "step": 14 + }, + { + "epoch": 0.005342831700801425, + "grad_norm": 4.123305797576904, + "learning_rate": 2.491103202846975e-08, + "loss": 1.1766, + "mean_token_accuracy": 0.6794708967208862, + "num_tokens": 4273252.0, + "step": 15 + }, + { + "epoch": 0.005699020480854853, + "grad_norm": 
4.297938823699951, + "learning_rate": 2.6690391459074733e-08, + "loss": 1.1899, + "mean_token_accuracy": 0.6832784116268158, + "num_tokens": 4550751.0, + "step": 16 + }, + { + "epoch": 0.006055209260908282, + "grad_norm": 4.101216793060303, + "learning_rate": 2.8469750889679712e-08, + "loss": 1.1778, + "mean_token_accuracy": 0.684531956911087, + "num_tokens": 4849491.0, + "step": 17 + }, + { + "epoch": 0.006411398040961709, + "grad_norm": 4.154327869415283, + "learning_rate": 3.0249110320284694e-08, + "loss": 1.1891, + "mean_token_accuracy": 0.6878853738307953, + "num_tokens": 5159634.0, + "step": 18 + }, + { + "epoch": 0.006767586821015138, + "grad_norm": 4.440008163452148, + "learning_rate": 3.202846975088968e-08, + "loss": 1.1831, + "mean_token_accuracy": 0.6805078685283661, + "num_tokens": 5431623.0, + "step": 19 + }, + { + "epoch": 0.007123775601068566, + "grad_norm": 3.958987236022949, + "learning_rate": 3.3807829181494665e-08, + "loss": 1.1243, + "mean_token_accuracy": 0.6958417594432831, + "num_tokens": 5763367.0, + "step": 20 + }, + { + "epoch": 0.007479964381121995, + "grad_norm": 4.11411714553833, + "learning_rate": 3.5587188612099644e-08, + "loss": 1.2159, + "mean_token_accuracy": 0.6717262119054794, + "num_tokens": 6083562.0, + "step": 21 + }, + { + "epoch": 0.007836153161175422, + "grad_norm": 4.192266464233398, + "learning_rate": 3.736654804270462e-08, + "loss": 1.2121, + "mean_token_accuracy": 0.673796683549881, + "num_tokens": 6390417.0, + "step": 22 + }, + { + "epoch": 0.00819234194122885, + "grad_norm": 4.168990612030029, + "learning_rate": 3.914590747330961e-08, + "loss": 1.1865, + "mean_token_accuracy": 0.674561932682991, + "num_tokens": 6679228.0, + "step": 23 + }, + { + "epoch": 0.008548530721282279, + "grad_norm": 4.17656946182251, + "learning_rate": 4.092526690391459e-08, + "loss": 1.1543, + "mean_token_accuracy": 0.6812122315168381, + "num_tokens": 6964722.0, + "step": 24 + }, + { + "epoch": 0.008904719501335707, + "grad_norm": 
4.058330059051514, + "learning_rate": 4.270462633451957e-08, + "loss": 1.1892, + "mean_token_accuracy": 0.678393691778183, + "num_tokens": 7282289.0, + "step": 25 + }, + { + "epoch": 0.009260908281389136, + "grad_norm": 3.9612481594085693, + "learning_rate": 4.448398576512456e-08, + "loss": 1.086, + "mean_token_accuracy": 0.7028195112943649, + "num_tokens": 7584706.0, + "step": 26 + }, + { + "epoch": 0.009617097061442564, + "grad_norm": 4.074823379516602, + "learning_rate": 4.626334519572953e-08, + "loss": 1.1594, + "mean_token_accuracy": 0.6880101561546326, + "num_tokens": 7884191.0, + "step": 27 + }, + { + "epoch": 0.009973285841495993, + "grad_norm": 4.1675262451171875, + "learning_rate": 4.8042704626334516e-08, + "loss": 1.2386, + "mean_token_accuracy": 0.6645658910274506, + "num_tokens": 8192733.0, + "step": 28 + }, + { + "epoch": 0.010329474621549421, + "grad_norm": 4.412525653839111, + "learning_rate": 4.98220640569395e-08, + "loss": 1.1609, + "mean_token_accuracy": 0.6867476999759674, + "num_tokens": 8440968.0, + "step": 29 + }, + { + "epoch": 0.01068566340160285, + "grad_norm": 4.355332374572754, + "learning_rate": 5.160142348754448e-08, + "loss": 1.2277, + "mean_token_accuracy": 0.6673260033130646, + "num_tokens": 8719468.0, + "step": 30 + }, + { + "epoch": 0.011041852181656278, + "grad_norm": 4.125389575958252, + "learning_rate": 5.3380782918149466e-08, + "loss": 1.1686, + "mean_token_accuracy": 0.6793055534362793, + "num_tokens": 9019289.0, + "step": 31 + }, + { + "epoch": 0.011398040961709707, + "grad_norm": 4.341821670532227, + "learning_rate": 5.516014234875445e-08, + "loss": 1.2658, + "mean_token_accuracy": 0.6558482199907303, + "num_tokens": 9306149.0, + "step": 32 + }, + { + "epoch": 0.011754229741763135, + "grad_norm": 3.8697962760925293, + "learning_rate": 5.6939501779359424e-08, + "loss": 1.043, + "mean_token_accuracy": 0.7134619653224945, + "num_tokens": 9606390.0, + "step": 33 + }, + { + "epoch": 0.012110418521816563, + "grad_norm": 
4.159411430358887, + "learning_rate": 5.871886120996441e-08, + "loss": 1.1501, + "mean_token_accuracy": 0.6836867928504944, + "num_tokens": 9884664.0, + "step": 34 + }, + { + "epoch": 0.012466607301869992, + "grad_norm": 4.533294677734375, + "learning_rate": 6.049822064056939e-08, + "loss": 1.2433, + "mean_token_accuracy": 0.6650424599647522, + "num_tokens": 10130364.0, + "step": 35 + }, + { + "epoch": 0.012822796081923419, + "grad_norm": 4.165543556213379, + "learning_rate": 6.227758007117437e-08, + "loss": 1.1834, + "mean_token_accuracy": 0.6782118380069733, + "num_tokens": 10403747.0, + "step": 36 + }, + { + "epoch": 0.013178984861976847, + "grad_norm": 4.108343124389648, + "learning_rate": 6.405693950177936e-08, + "loss": 1.1618, + "mean_token_accuracy": 0.6874676048755646, + "num_tokens": 10687939.0, + "step": 37 + }, + { + "epoch": 0.013535173642030275, + "grad_norm": 4.056671142578125, + "learning_rate": 6.583629893238433e-08, + "loss": 1.1842, + "mean_token_accuracy": 0.6804183274507523, + "num_tokens": 10966106.0, + "step": 38 + }, + { + "epoch": 0.013891362422083704, + "grad_norm": 4.098731994628906, + "learning_rate": 6.761565836298933e-08, + "loss": 1.2227, + "mean_token_accuracy": 0.6746459007263184, + "num_tokens": 11272504.0, + "step": 39 + }, + { + "epoch": 0.014247551202137132, + "grad_norm": 4.3264570236206055, + "learning_rate": 6.93950177935943e-08, + "loss": 1.1944, + "mean_token_accuracy": 0.6800103783607483, + "num_tokens": 11530509.0, + "step": 40 + }, + { + "epoch": 0.01460373998219056, + "grad_norm": 4.153872966766357, + "learning_rate": 7.117437722419929e-08, + "loss": 1.2334, + "mean_token_accuracy": 0.6673504263162613, + "num_tokens": 11814206.0, + "step": 41 + }, + { + "epoch": 0.01495992876224399, + "grad_norm": 4.045811653137207, + "learning_rate": 7.295373665480427e-08, + "loss": 1.1403, + "mean_token_accuracy": 0.6890576183795929, + "num_tokens": 12095875.0, + "step": 42 + }, + { + "epoch": 0.015316117542297418, + "grad_norm": 
4.195240020751953, + "learning_rate": 7.473309608540925e-08, + "loss": 1.2321, + "mean_token_accuracy": 0.6647895276546478, + "num_tokens": 12368653.0, + "step": 43 + }, + { + "epoch": 0.015672306322350844, + "grad_norm": 4.237295627593994, + "learning_rate": 7.651245551601423e-08, + "loss": 1.2098, + "mean_token_accuracy": 0.6790714412927628, + "num_tokens": 12644530.0, + "step": 44 + }, + { + "epoch": 0.016028495102404273, + "grad_norm": 4.076744079589844, + "learning_rate": 7.829181494661922e-08, + "loss": 1.0925, + "mean_token_accuracy": 0.7014216929674149, + "num_tokens": 12912490.0, + "step": 45 + }, + { + "epoch": 0.0163846838824577, + "grad_norm": 4.116878032684326, + "learning_rate": 8.00711743772242e-08, + "loss": 1.1299, + "mean_token_accuracy": 0.6899599879980087, + "num_tokens": 13174778.0, + "step": 46 + }, + { + "epoch": 0.01674087266251113, + "grad_norm": 4.263287544250488, + "learning_rate": 8.185053380782917e-08, + "loss": 1.2797, + "mean_token_accuracy": 0.6595384031534195, + "num_tokens": 13458467.0, + "step": 47 + }, + { + "epoch": 0.017097061442564558, + "grad_norm": 4.380281925201416, + "learning_rate": 8.362989323843416e-08, + "loss": 1.2194, + "mean_token_accuracy": 0.6674812585115433, + "num_tokens": 13714714.0, + "step": 48 + }, + { + "epoch": 0.017453250222617987, + "grad_norm": 4.034851551055908, + "learning_rate": 8.540925266903915e-08, + "loss": 1.1348, + "mean_token_accuracy": 0.6883769929409027, + "num_tokens": 13964432.0, + "step": 49 + }, + { + "epoch": 0.017809439002671415, + "grad_norm": 3.718451976776123, + "learning_rate": 8.718861209964412e-08, + "loss": 1.149, + "mean_token_accuracy": 0.6882762759923935, + "num_tokens": 14248070.0, + "step": 50 + }, + { + "epoch": 0.018165627782724843, + "grad_norm": 3.7917284965515137, + "learning_rate": 8.896797153024912e-08, + "loss": 1.1469, + "mean_token_accuracy": 0.687828540802002, + "num_tokens": 14535561.0, + "step": 51 + }, + { + "epoch": 0.018521816562778272, + "grad_norm": 
3.8317673206329346, + "learning_rate": 9.074733096085409e-08, + "loss": 1.1561, + "mean_token_accuracy": 0.6851571351289749, + "num_tokens": 14811864.0, + "step": 52 + }, + { + "epoch": 0.0188780053428317, + "grad_norm": 3.7398104667663574, + "learning_rate": 9.252669039145906e-08, + "loss": 1.2208, + "mean_token_accuracy": 0.6661153584718704, + "num_tokens": 15102790.0, + "step": 53 + }, + { + "epoch": 0.01923419412288513, + "grad_norm": 3.810863733291626, + "learning_rate": 9.430604982206406e-08, + "loss": 1.1775, + "mean_token_accuracy": 0.6764954179525375, + "num_tokens": 15400116.0, + "step": 54 + }, + { + "epoch": 0.019590382902938557, + "grad_norm": 3.7737956047058105, + "learning_rate": 9.608540925266903e-08, + "loss": 1.1403, + "mean_token_accuracy": 0.6855367571115494, + "num_tokens": 15671163.0, + "step": 55 + }, + { + "epoch": 0.019946571682991986, + "grad_norm": 3.736241102218628, + "learning_rate": 9.786476868327402e-08, + "loss": 1.1639, + "mean_token_accuracy": 0.6807015240192413, + "num_tokens": 15972383.0, + "step": 56 + }, + { + "epoch": 0.020302760463045414, + "grad_norm": 3.5165903568267822, + "learning_rate": 9.9644128113879e-08, + "loss": 1.065, + "mean_token_accuracy": 0.7037084549665451, + "num_tokens": 16275617.0, + "step": 57 + }, + { + "epoch": 0.020658949243098843, + "grad_norm": 3.5096795558929443, + "learning_rate": 1.0142348754448399e-07, + "loss": 1.1012, + "mean_token_accuracy": 0.6970059126615524, + "num_tokens": 16583815.0, + "step": 58 + }, + { + "epoch": 0.02101513802315227, + "grad_norm": 3.7062926292419434, + "learning_rate": 1.0320284697508896e-07, + "loss": 1.1933, + "mean_token_accuracy": 0.6775388866662979, + "num_tokens": 16893538.0, + "step": 59 + }, + { + "epoch": 0.0213713268032057, + "grad_norm": 3.490142583847046, + "learning_rate": 1.0498220640569395e-07, + "loss": 1.0427, + "mean_token_accuracy": 0.7150713503360748, + "num_tokens": 17196245.0, + "step": 60 + }, + { + "epoch": 0.021727515583259128, + "grad_norm": 
3.990615129470825, + "learning_rate": 1.0676156583629893e-07, + "loss": 1.253, + "mean_token_accuracy": 0.6626797020435333, + "num_tokens": 17465301.0, + "step": 61 + }, + { + "epoch": 0.022083704363312556, + "grad_norm": 3.8062329292297363, + "learning_rate": 1.085409252669039e-07, + "loss": 1.1916, + "mean_token_accuracy": 0.6781026422977448, + "num_tokens": 17743103.0, + "step": 62 + }, + { + "epoch": 0.022439893143365985, + "grad_norm": 3.6344122886657715, + "learning_rate": 1.103202846975089e-07, + "loss": 1.1487, + "mean_token_accuracy": 0.6853166967630386, + "num_tokens": 18044303.0, + "step": 63 + }, + { + "epoch": 0.022796081923419413, + "grad_norm": 3.9079604148864746, + "learning_rate": 1.1209964412811388e-07, + "loss": 1.214, + "mean_token_accuracy": 0.6669175773859024, + "num_tokens": 18318225.0, + "step": 64 + }, + { + "epoch": 0.02315227070347284, + "grad_norm": 3.8242387771606445, + "learning_rate": 1.1387900355871885e-07, + "loss": 1.1991, + "mean_token_accuracy": 0.6760813742876053, + "num_tokens": 18615536.0, + "step": 65 + }, + { + "epoch": 0.02350845948352627, + "grad_norm": 3.5899531841278076, + "learning_rate": 1.1565836298932385e-07, + "loss": 1.1432, + "mean_token_accuracy": 0.6861843764781952, + "num_tokens": 18947365.0, + "step": 66 + }, + { + "epoch": 0.0238646482635797, + "grad_norm": 3.820950984954834, + "learning_rate": 1.1743772241992882e-07, + "loss": 1.1807, + "mean_token_accuracy": 0.6778843253850937, + "num_tokens": 19211745.0, + "step": 67 + }, + { + "epoch": 0.024220837043633127, + "grad_norm": 3.591355800628662, + "learning_rate": 1.1921708185053382e-07, + "loss": 1.183, + "mean_token_accuracy": 0.6775119453668594, + "num_tokens": 19474225.0, + "step": 68 + }, + { + "epoch": 0.024577025823686555, + "grad_norm": 3.423020362854004, + "learning_rate": 1.2099644128113878e-07, + "loss": 1.1123, + "mean_token_accuracy": 0.6830073744058609, + "num_tokens": 19731655.0, + "step": 69 + }, + { + "epoch": 0.024933214603739984, + 
"grad_norm": 3.231109380722046, + "learning_rate": 1.2277580071174376e-07, + "loss": 1.1228, + "mean_token_accuracy": 0.6918217837810516, + "num_tokens": 20033124.0, + "step": 70 + }, + { + "epoch": 0.025289403383793412, + "grad_norm": 3.2652082443237305, + "learning_rate": 1.2455516014234875e-07, + "loss": 1.0961, + "mean_token_accuracy": 0.696757510304451, + "num_tokens": 20298087.0, + "step": 71 + }, + { + "epoch": 0.025645592163846837, + "grad_norm": 3.199470281600952, + "learning_rate": 1.2633451957295373e-07, + "loss": 1.1565, + "mean_token_accuracy": 0.6799522638320923, + "num_tokens": 20579842.0, + "step": 72 + }, + { + "epoch": 0.026001780943900266, + "grad_norm": 3.1091482639312744, + "learning_rate": 1.2811387900355872e-07, + "loss": 1.1732, + "mean_token_accuracy": 0.6808935403823853, + "num_tokens": 20855756.0, + "step": 73 + }, + { + "epoch": 0.026357969723953694, + "grad_norm": 2.8105270862579346, + "learning_rate": 1.298932384341637e-07, + "loss": 1.022, + "mean_token_accuracy": 0.7098027020692825, + "num_tokens": 21172954.0, + "step": 74 + }, + { + "epoch": 0.026714158504007122, + "grad_norm": 3.0396275520324707, + "learning_rate": 1.3167259786476866e-07, + "loss": 1.1497, + "mean_token_accuracy": 0.6806823462247849, + "num_tokens": 21425188.0, + "step": 75 + }, + { + "epoch": 0.02707034728406055, + "grad_norm": 2.7914083003997803, + "learning_rate": 1.3345195729537365e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6895091831684113, + "num_tokens": 21724701.0, + "step": 76 + }, + { + "epoch": 0.02742653606411398, + "grad_norm": 2.738603353500366, + "learning_rate": 1.3523131672597866e-07, + "loss": 1.0791, + "mean_token_accuracy": 0.6929945051670074, + "num_tokens": 22025706.0, + "step": 77 + }, + { + "epoch": 0.027782724844167408, + "grad_norm": 2.9877126216888428, + "learning_rate": 1.3701067615658362e-07, + "loss": 1.1784, + "mean_token_accuracy": 0.6744126677513123, + "num_tokens": 22299660.0, + "step": 78 + }, + { + "epoch": 
0.028138913624220836, + "grad_norm": 2.9029383659362793, + "learning_rate": 1.387900355871886e-07, + "loss": 1.1776, + "mean_token_accuracy": 0.6756375879049301, + "num_tokens": 22585177.0, + "step": 79 + }, + { + "epoch": 0.028495102404274265, + "grad_norm": 2.6541619300842285, + "learning_rate": 1.405693950177936e-07, + "loss": 1.0858, + "mean_token_accuracy": 0.6981464922428131, + "num_tokens": 22918264.0, + "step": 80 + }, + { + "epoch": 0.028851291184327693, + "grad_norm": 2.7806925773620605, + "learning_rate": 1.4234875444839858e-07, + "loss": 1.0957, + "mean_token_accuracy": 0.6895806789398193, + "num_tokens": 23191750.0, + "step": 81 + }, + { + "epoch": 0.02920747996438112, + "grad_norm": 2.7091400623321533, + "learning_rate": 1.4412811387900356e-07, + "loss": 1.1267, + "mean_token_accuracy": 0.68799889087677, + "num_tokens": 23494473.0, + "step": 82 + }, + { + "epoch": 0.02956366874443455, + "grad_norm": 2.603060245513916, + "learning_rate": 1.4590747330960855e-07, + "loss": 1.124, + "mean_token_accuracy": 0.6888981610536575, + "num_tokens": 23788978.0, + "step": 83 + }, + { + "epoch": 0.02991985752448798, + "grad_norm": 2.6328485012054443, + "learning_rate": 1.476868327402135e-07, + "loss": 1.0776, + "mean_token_accuracy": 0.696533277630806, + "num_tokens": 24076138.0, + "step": 84 + }, + { + "epoch": 0.030276046304541407, + "grad_norm": 2.5288712978363037, + "learning_rate": 1.494661921708185e-07, + "loss": 1.1173, + "mean_token_accuracy": 0.6951026916503906, + "num_tokens": 24381018.0, + "step": 85 + }, + { + "epoch": 0.030632235084594835, + "grad_norm": 2.5778894424438477, + "learning_rate": 1.512455516014235e-07, + "loss": 1.1555, + "mean_token_accuracy": 0.6800205111503601, + "num_tokens": 24691333.0, + "step": 86 + }, + { + "epoch": 0.030988423864648264, + "grad_norm": 2.705747365951538, + "learning_rate": 1.5302491103202846e-07, + "loss": 1.1297, + "mean_token_accuracy": 0.6897296011447906, + "num_tokens": 24966196.0, + "step": 87 + }, + { + 
"epoch": 0.03134461264470169, + "grad_norm": 2.5825278759002686, + "learning_rate": 1.5480427046263345e-07, + "loss": 1.1911, + "mean_token_accuracy": 0.6708679348230362, + "num_tokens": 25287002.0, + "step": 88 + }, + { + "epoch": 0.03170080142475512, + "grad_norm": 2.633972644805908, + "learning_rate": 1.5658362989323843e-07, + "loss": 1.0553, + "mean_token_accuracy": 0.7013304233551025, + "num_tokens": 25556854.0, + "step": 89 + }, + { + "epoch": 0.032056990204808546, + "grad_norm": 2.4419362545013428, + "learning_rate": 1.583629893238434e-07, + "loss": 1.1075, + "mean_token_accuracy": 0.6876514405012131, + "num_tokens": 25872911.0, + "step": 90 + }, + { + "epoch": 0.03241317898486198, + "grad_norm": 2.4279234409332275, + "learning_rate": 1.601423487544484e-07, + "loss": 1.0832, + "mean_token_accuracy": 0.697997510433197, + "num_tokens": 26153776.0, + "step": 91 + }, + { + "epoch": 0.0327693677649154, + "grad_norm": 2.3850576877593994, + "learning_rate": 1.619217081850534e-07, + "loss": 1.0634, + "mean_token_accuracy": 0.6993038207292557, + "num_tokens": 26414286.0, + "step": 92 + }, + { + "epoch": 0.033125556544968834, + "grad_norm": 2.3241846561431885, + "learning_rate": 1.6370106761565835e-07, + "loss": 1.0442, + "mean_token_accuracy": 0.7039203941822052, + "num_tokens": 26692119.0, + "step": 93 + }, + { + "epoch": 0.03348174532502226, + "grad_norm": 2.1554622650146484, + "learning_rate": 1.6548042704626334e-07, + "loss": 1.0813, + "mean_token_accuracy": 0.6971811205148697, + "num_tokens": 26993029.0, + "step": 94 + }, + { + "epoch": 0.03383793410507569, + "grad_norm": 2.0872504711151123, + "learning_rate": 1.6725978647686832e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.7021628469228745, + "num_tokens": 27278023.0, + "step": 95 + }, + { + "epoch": 0.034194122885129116, + "grad_norm": 2.0591022968292236, + "learning_rate": 1.690391459074733e-07, + "loss": 1.1195, + "mean_token_accuracy": 0.6862035393714905, + "num_tokens": 27581462.0, + "step": 96 + }, + 
{ + "epoch": 0.03455031166518255, + "grad_norm": 2.0124263763427734, + "learning_rate": 1.708185053380783e-07, + "loss": 1.1177, + "mean_token_accuracy": 0.6875555664300919, + "num_tokens": 27870628.0, + "step": 97 + }, + { + "epoch": 0.03490650044523597, + "grad_norm": 1.95350182056427, + "learning_rate": 1.7259786476868328e-07, + "loss": 1.0902, + "mean_token_accuracy": 0.6871015578508377, + "num_tokens": 28154801.0, + "step": 98 + }, + { + "epoch": 0.035262689225289405, + "grad_norm": 1.873213529586792, + "learning_rate": 1.7437722419928824e-07, + "loss": 1.0682, + "mean_token_accuracy": 0.6964186578989029, + "num_tokens": 28454385.0, + "step": 99 + }, + { + "epoch": 0.03561887800534283, + "grad_norm": 1.8682081699371338, + "learning_rate": 1.7615658362989322e-07, + "loss": 1.0883, + "mean_token_accuracy": 0.6950496435165405, + "num_tokens": 28755186.0, + "step": 100 + }, + { + "epoch": 0.03597506678539626, + "grad_norm": 1.8708288669586182, + "learning_rate": 1.7793594306049823e-07, + "loss": 1.0538, + "mean_token_accuracy": 0.6967617124319077, + "num_tokens": 29033855.0, + "step": 101 + }, + { + "epoch": 0.03633125556544969, + "grad_norm": 1.8984884023666382, + "learning_rate": 1.797153024911032e-07, + "loss": 1.0213, + "mean_token_accuracy": 0.7096554040908813, + "num_tokens": 29326824.0, + "step": 102 + }, + { + "epoch": 0.03668744434550312, + "grad_norm": 1.8537520170211792, + "learning_rate": 1.8149466192170818e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.6997749358415604, + "num_tokens": 29604242.0, + "step": 103 + }, + { + "epoch": 0.037043633125556544, + "grad_norm": 1.8823299407958984, + "learning_rate": 1.8327402135231316e-07, + "loss": 1.0626, + "mean_token_accuracy": 0.7015951573848724, + "num_tokens": 29938565.0, + "step": 104 + }, + { + "epoch": 0.037399821905609976, + "grad_norm": 1.7856526374816895, + "learning_rate": 1.8505338078291812e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.7123600989580154, + "num_tokens": 30234738.0, + "step": 
105 + }, + { + "epoch": 0.0377560106856634, + "grad_norm": 1.9855825901031494, + "learning_rate": 1.8683274021352314e-07, + "loss": 1.0919, + "mean_token_accuracy": 0.6950525641441345, + "num_tokens": 30498530.0, + "step": 106 + }, + { + "epoch": 0.03811219946571683, + "grad_norm": 1.7886571884155273, + "learning_rate": 1.8861209964412812e-07, + "loss": 1.0118, + "mean_token_accuracy": 0.7054216712713242, + "num_tokens": 30812392.0, + "step": 107 + }, + { + "epoch": 0.03846838824577026, + "grad_norm": 2.3375041484832764, + "learning_rate": 1.9039145907473308e-07, + "loss": 1.0164, + "mean_token_accuracy": 0.7097709625959396, + "num_tokens": 31085830.0, + "step": 108 + }, + { + "epoch": 0.03882457702582369, + "grad_norm": 1.9610555171966553, + "learning_rate": 1.9217081850533807e-07, + "loss": 1.0284, + "mean_token_accuracy": 0.7056591510772705, + "num_tokens": 31380165.0, + "step": 109 + }, + { + "epoch": 0.039180765805877114, + "grad_norm": 2.18061900138855, + "learning_rate": 1.9395017793594305e-07, + "loss": 1.0674, + "mean_token_accuracy": 0.6946629136800766, + "num_tokens": 31672966.0, + "step": 110 + }, + { + "epoch": 0.039536954585930546, + "grad_norm": 2.19195294380188, + "learning_rate": 1.9572953736654804e-07, + "loss": 1.0324, + "mean_token_accuracy": 0.701454222202301, + "num_tokens": 31937851.0, + "step": 111 + }, + { + "epoch": 0.03989314336598397, + "grad_norm": 2.00923752784729, + "learning_rate": 1.9750889679715302e-07, + "loss": 1.0668, + "mean_token_accuracy": 0.6968457698822021, + "num_tokens": 32230358.0, + "step": 112 + }, + { + "epoch": 0.0402493321460374, + "grad_norm": 2.1728146076202393, + "learning_rate": 1.99288256227758e-07, + "loss": 1.1265, + "mean_token_accuracy": 0.6853938698768616, + "num_tokens": 32518798.0, + "step": 113 + }, + { + "epoch": 0.04060552092609083, + "grad_norm": 1.7728931903839111, + "learning_rate": 2.0106761565836297e-07, + "loss": 1.0055, + "mean_token_accuracy": 0.7120089828968048, + "num_tokens": 32813631.0, + 
"step": 114 + }, + { + "epoch": 0.04096170970614425, + "grad_norm": 1.9216030836105347, + "learning_rate": 2.0284697508896798e-07, + "loss": 1.0118, + "mean_token_accuracy": 0.7117805927991867, + "num_tokens": 33105817.0, + "step": 115 + }, + { + "epoch": 0.041317898486197685, + "grad_norm": 2.2709245681762695, + "learning_rate": 2.0462633451957296e-07, + "loss": 1.0743, + "mean_token_accuracy": 0.6916919499635696, + "num_tokens": 33398003.0, + "step": 116 + }, + { + "epoch": 0.04167408726625111, + "grad_norm": 2.0474367141723633, + "learning_rate": 2.0640569395017792e-07, + "loss": 1.0664, + "mean_token_accuracy": 0.6990501135587692, + "num_tokens": 33700294.0, + "step": 117 + }, + { + "epoch": 0.04203027604630454, + "grad_norm": 2.0752437114715576, + "learning_rate": 2.081850533807829e-07, + "loss": 1.0446, + "mean_token_accuracy": 0.7022008746862411, + "num_tokens": 33985819.0, + "step": 118 + }, + { + "epoch": 0.04238646482635797, + "grad_norm": 2.3876030445098877, + "learning_rate": 2.099644128113879e-07, + "loss": 1.0407, + "mean_token_accuracy": 0.7015276998281479, + "num_tokens": 34275000.0, + "step": 119 + }, + { + "epoch": 0.0427426536064114, + "grad_norm": 1.782159447669983, + "learning_rate": 2.1174377224199288e-07, + "loss": 1.0519, + "mean_token_accuracy": 0.7025104463100433, + "num_tokens": 34547859.0, + "step": 120 + }, + { + "epoch": 0.043098842386464824, + "grad_norm": 1.607873558998108, + "learning_rate": 2.1352313167259786e-07, + "loss": 1.0915, + "mean_token_accuracy": 0.6948998421430588, + "num_tokens": 34851903.0, + "step": 121 + }, + { + "epoch": 0.043455031166518256, + "grad_norm": 1.7159550189971924, + "learning_rate": 2.1530249110320285e-07, + "loss": 1.0042, + "mean_token_accuracy": 0.7131743729114532, + "num_tokens": 35165195.0, + "step": 122 + }, + { + "epoch": 0.04381121994657168, + "grad_norm": 2.2038748264312744, + "learning_rate": 2.170818505338078e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.689026951789856, + "num_tokens": 
35426601.0, + "step": 123 + }, + { + "epoch": 0.04416740872662511, + "grad_norm": 2.2771265506744385, + "learning_rate": 2.188612099644128e-07, + "loss": 1.0133, + "mean_token_accuracy": 0.7077834457159042, + "num_tokens": 35727742.0, + "step": 124 + }, + { + "epoch": 0.04452359750667854, + "grad_norm": 2.1834189891815186, + "learning_rate": 2.206405693950178e-07, + "loss": 1.0002, + "mean_token_accuracy": 0.7076937705278397, + "num_tokens": 36005349.0, + "step": 125 + }, + { + "epoch": 0.04487978628673197, + "grad_norm": 4.246620178222656, + "learning_rate": 2.2241992882562277e-07, + "loss": 0.9436, + "mean_token_accuracy": 0.7247613221406937, + "num_tokens": 36297434.0, + "step": 126 + }, + { + "epoch": 0.045235975066785394, + "grad_norm": 2.5922951698303223, + "learning_rate": 2.2419928825622775e-07, + "loss": 0.9676, + "mean_token_accuracy": 0.7191519439220428, + "num_tokens": 36582849.0, + "step": 127 + }, + { + "epoch": 0.045592163846838826, + "grad_norm": 2.3888866901397705, + "learning_rate": 2.2597864768683274e-07, + "loss": 1.0382, + "mean_token_accuracy": 0.7063083052635193, + "num_tokens": 36842209.0, + "step": 128 + }, + { + "epoch": 0.04594835262689225, + "grad_norm": 2.2747912406921387, + "learning_rate": 2.277580071174377e-07, + "loss": 1.0938, + "mean_token_accuracy": 0.688449501991272, + "num_tokens": 37087128.0, + "step": 129 + }, + { + "epoch": 0.04630454140694568, + "grad_norm": 1.8594565391540527, + "learning_rate": 2.295373665480427e-07, + "loss": 1.023, + "mean_token_accuracy": 0.7025166749954224, + "num_tokens": 37375053.0, + "step": 130 + }, + { + "epoch": 0.04666073018699911, + "grad_norm": 2.4591805934906006, + "learning_rate": 2.313167259786477e-07, + "loss": 1.002, + "mean_token_accuracy": 0.706584095954895, + "num_tokens": 37660279.0, + "step": 131 + }, + { + "epoch": 0.04701691896705254, + "grad_norm": 2.0601272583007812, + "learning_rate": 2.3309608540925265e-07, + "loss": 1.0011, + "mean_token_accuracy": 0.7081160545349121, + 
"num_tokens": 37937426.0, + "step": 132 + }, + { + "epoch": 0.047373107747105965, + "grad_norm": 1.7133070230484009, + "learning_rate": 2.3487544483985764e-07, + "loss": 1.0004, + "mean_token_accuracy": 0.7075323313474655, + "num_tokens": 38226011.0, + "step": 133 + }, + { + "epoch": 0.0477292965271594, + "grad_norm": 1.737507939338684, + "learning_rate": 2.366548042704626e-07, + "loss": 0.9673, + "mean_token_accuracy": 0.7157670110464096, + "num_tokens": 38541415.0, + "step": 134 + }, + { + "epoch": 0.04808548530721282, + "grad_norm": 2.7653300762176514, + "learning_rate": 2.3843416370106764e-07, + "loss": 0.9822, + "mean_token_accuracy": 0.7129713147878647, + "num_tokens": 38807738.0, + "step": 135 + }, + { + "epoch": 0.048441674087266254, + "grad_norm": 2.0446248054504395, + "learning_rate": 2.4021352313167257e-07, + "loss": 0.9604, + "mean_token_accuracy": 0.7129597663879395, + "num_tokens": 39100713.0, + "step": 136 + }, + { + "epoch": 0.04879786286731968, + "grad_norm": 2.2602169513702393, + "learning_rate": 2.4199288256227755e-07, + "loss": 0.9399, + "mean_token_accuracy": 0.7203000634908676, + "num_tokens": 39373406.0, + "step": 137 + }, + { + "epoch": 0.04915405164737311, + "grad_norm": 1.7390481233596802, + "learning_rate": 2.4377224199288254e-07, + "loss": 0.986, + "mean_token_accuracy": 0.7117860615253448, + "num_tokens": 39641764.0, + "step": 138 + }, + { + "epoch": 0.049510240427426536, + "grad_norm": 1.5277738571166992, + "learning_rate": 2.455516014234875e-07, + "loss": 0.986, + "mean_token_accuracy": 0.7138210982084274, + "num_tokens": 39943523.0, + "step": 139 + }, + { + "epoch": 0.04986642920747997, + "grad_norm": 2.0579216480255127, + "learning_rate": 2.473309608540925e-07, + "loss": 0.9405, + "mean_token_accuracy": 0.7281115055084229, + "num_tokens": 40251230.0, + "step": 140 + }, + { + "epoch": 0.05022261798753339, + "grad_norm": 2.291762351989746, + "learning_rate": 2.491103202846975e-07, + "loss": 0.9312, + "mean_token_accuracy": 
0.7216415256261826, + "num_tokens": 40489555.0, + "step": 141 + }, + { + "epoch": 0.050578806767586824, + "grad_norm": 1.648435115814209, + "learning_rate": 2.508896797153025e-07, + "loss": 0.9822, + "mean_token_accuracy": 0.712884396314621, + "num_tokens": 40793955.0, + "step": 142 + }, + { + "epoch": 0.05093499554764025, + "grad_norm": 1.5279895067214966, + "learning_rate": 2.5266903914590747e-07, + "loss": 1.0772, + "mean_token_accuracy": 0.6857463121414185, + "num_tokens": 41111136.0, + "step": 143 + }, + { + "epoch": 0.051291184327693674, + "grad_norm": 1.653621792793274, + "learning_rate": 2.5444839857651245e-07, + "loss": 1.0094, + "mean_token_accuracy": 0.711881548166275, + "num_tokens": 41394841.0, + "step": 144 + }, + { + "epoch": 0.051647373107747106, + "grad_norm": 1.3503801822662354, + "learning_rate": 2.5622775800711744e-07, + "loss": 0.9448, + "mean_token_accuracy": 0.7205002754926682, + "num_tokens": 41722631.0, + "step": 145 + }, + { + "epoch": 0.05200356188780053, + "grad_norm": 2.0390210151672363, + "learning_rate": 2.580071174377224e-07, + "loss": 0.9474, + "mean_token_accuracy": 0.7237126380205154, + "num_tokens": 42038757.0, + "step": 146 + }, + { + "epoch": 0.05235975066785396, + "grad_norm": 1.5603787899017334, + "learning_rate": 2.597864768683274e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.6993799209594727, + "num_tokens": 42339143.0, + "step": 147 + }, + { + "epoch": 0.05271593944790739, + "grad_norm": 1.6321159601211548, + "learning_rate": 2.6156583629893234e-07, + "loss": 1.038, + "mean_token_accuracy": 0.7015133649110794, + "num_tokens": 42652161.0, + "step": 148 + }, + { + "epoch": 0.05307212822796082, + "grad_norm": 1.8708714246749878, + "learning_rate": 2.6334519572953733e-07, + "loss": 0.9869, + "mean_token_accuracy": 0.7105865329504013, + "num_tokens": 42938680.0, + "step": 149 + }, + { + "epoch": 0.053428317008014245, + "grad_norm": 1.5119789838790894, + "learning_rate": 2.651245551601423e-07, + "loss": 0.9723, + 
"mean_token_accuracy": 0.720861554145813, + "num_tokens": 43191180.0, + "step": 150 + }, + { + "epoch": 0.05378450578806768, + "grad_norm": 1.7853724956512451, + "learning_rate": 2.669039145907473e-07, + "loss": 1.014, + "mean_token_accuracy": 0.7039236128330231, + "num_tokens": 43511018.0, + "step": 151 + }, + { + "epoch": 0.0541406945681211, + "grad_norm": 1.6585553884506226, + "learning_rate": 2.6868327402135234e-07, + "loss": 0.9547, + "mean_token_accuracy": 0.7229085713624954, + "num_tokens": 43823833.0, + "step": 152 + }, + { + "epoch": 0.054496883348174534, + "grad_norm": 1.45585036277771, + "learning_rate": 2.704626334519573e-07, + "loss": 1.0291, + "mean_token_accuracy": 0.7003869861364365, + "num_tokens": 44128838.0, + "step": 153 + }, + { + "epoch": 0.05485307212822796, + "grad_norm": 1.693566083908081, + "learning_rate": 2.7224199288256225e-07, + "loss": 0.976, + "mean_token_accuracy": 0.7171164751052856, + "num_tokens": 44394166.0, + "step": 154 + }, + { + "epoch": 0.05520926090828139, + "grad_norm": 1.727564811706543, + "learning_rate": 2.7402135231316724e-07, + "loss": 0.8688, + "mean_token_accuracy": 0.7434722036123276, + "num_tokens": 44698246.0, + "step": 155 + }, + { + "epoch": 0.055565449688334816, + "grad_norm": 1.4472321271896362, + "learning_rate": 2.758007117437722e-07, + "loss": 0.987, + "mean_token_accuracy": 0.7062523066997528, + "num_tokens": 44966325.0, + "step": 156 + }, + { + "epoch": 0.05592163846838825, + "grad_norm": 1.667776346206665, + "learning_rate": 2.775800711743772e-07, + "loss": 0.973, + "mean_token_accuracy": 0.7165121734142303, + "num_tokens": 45245139.0, + "step": 157 + }, + { + "epoch": 0.05627782724844167, + "grad_norm": 1.4901535511016846, + "learning_rate": 2.793594306049822e-07, + "loss": 1.0001, + "mean_token_accuracy": 0.7087368965148926, + "num_tokens": 45515802.0, + "step": 158 + }, + { + "epoch": 0.056634016028495104, + "grad_norm": 1.9539861679077148, + "learning_rate": 2.811387900355872e-07, + "loss": 0.9267, 
+ "mean_token_accuracy": 0.7313272655010223, + "num_tokens": 45787073.0, + "step": 159 + }, + { + "epoch": 0.05699020480854853, + "grad_norm": 1.720589280128479, + "learning_rate": 2.829181494661921e-07, + "loss": 0.9742, + "mean_token_accuracy": 0.7069613039493561, + "num_tokens": 46060339.0, + "step": 160 + }, + { + "epoch": 0.05734639358860196, + "grad_norm": 1.4645849466323853, + "learning_rate": 2.8469750889679715e-07, + "loss": 0.9389, + "mean_token_accuracy": 0.7234993875026703, + "num_tokens": 46372039.0, + "step": 161 + }, + { + "epoch": 0.057702582368655386, + "grad_norm": 1.3678747415542603, + "learning_rate": 2.8647686832740214e-07, + "loss": 0.9582, + "mean_token_accuracy": 0.7133847326040268, + "num_tokens": 46660330.0, + "step": 162 + }, + { + "epoch": 0.05805877114870882, + "grad_norm": 1.8124463558197021, + "learning_rate": 2.882562277580071e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.7073681950569153, + "num_tokens": 46927778.0, + "step": 163 + }, + { + "epoch": 0.05841495992876224, + "grad_norm": 1.9757546186447144, + "learning_rate": 2.900355871886121e-07, + "loss": 0.9496, + "mean_token_accuracy": 0.7165465503931046, + "num_tokens": 47173474.0, + "step": 164 + }, + { + "epoch": 0.058771148708815675, + "grad_norm": 1.3054084777832031, + "learning_rate": 2.918149466192171e-07, + "loss": 0.9349, + "mean_token_accuracy": 0.7263690233230591, + "num_tokens": 47466265.0, + "step": 165 + }, + { + "epoch": 0.0591273374888691, + "grad_norm": 1.8284335136413574, + "learning_rate": 2.9359430604982203e-07, + "loss": 0.9821, + "mean_token_accuracy": 0.7157955020666122, + "num_tokens": 47738587.0, + "step": 166 + }, + { + "epoch": 0.05948352626892253, + "grad_norm": 1.7446715831756592, + "learning_rate": 2.95373665480427e-07, + "loss": 0.9908, + "mean_token_accuracy": 0.7045240700244904, + "num_tokens": 48011250.0, + "step": 167 + }, + { + "epoch": 0.05983971504897596, + "grad_norm": 1.1526044607162476, + "learning_rate": 2.97153024911032e-07, + "loss": 
1.0127, + "mean_token_accuracy": 0.7009474635124207, + "num_tokens": 48288302.0, + "step": 168 + }, + { + "epoch": 0.06019590382902939, + "grad_norm": 1.5354244709014893, + "learning_rate": 2.98932384341637e-07, + "loss": 1.0078, + "mean_token_accuracy": 0.710019662976265, + "num_tokens": 48571209.0, + "step": 169 + }, + { + "epoch": 0.060552092609082814, + "grad_norm": 1.2896158695220947, + "learning_rate": 3.0071174377224197e-07, + "loss": 0.8908, + "mean_token_accuracy": 0.7338657975196838, + "num_tokens": 48873101.0, + "step": 170 + }, + { + "epoch": 0.060908281389136246, + "grad_norm": 1.375154972076416, + "learning_rate": 3.02491103202847e-07, + "loss": 0.9395, + "mean_token_accuracy": 0.7218189537525177, + "num_tokens": 49169830.0, + "step": 171 + }, + { + "epoch": 0.06126447016918967, + "grad_norm": 1.2340350151062012, + "learning_rate": 3.0427046263345194e-07, + "loss": 0.9783, + "mean_token_accuracy": 0.7145285308361053, + "num_tokens": 49441384.0, + "step": 172 + }, + { + "epoch": 0.061620658949243096, + "grad_norm": 1.7818063497543335, + "learning_rate": 3.0604982206405693e-07, + "loss": 0.9749, + "mean_token_accuracy": 0.711155578494072, + "num_tokens": 49701699.0, + "step": 173 + }, + { + "epoch": 0.06197684772929653, + "grad_norm": 1.6867454051971436, + "learning_rate": 3.078291814946619e-07, + "loss": 0.939, + "mean_token_accuracy": 0.7225231230258942, + "num_tokens": 49985058.0, + "step": 174 + }, + { + "epoch": 0.06233303650934995, + "grad_norm": 1.720047950744629, + "learning_rate": 3.096085409252669e-07, + "loss": 0.9524, + "mean_token_accuracy": 0.7218315601348877, + "num_tokens": 50260390.0, + "step": 175 + }, + { + "epoch": 0.06268922528940338, + "grad_norm": 1.7641578912734985, + "learning_rate": 3.113879003558719e-07, + "loss": 0.9397, + "mean_token_accuracy": 0.7213225066661835, + "num_tokens": 50550056.0, + "step": 176 + }, + { + "epoch": 0.06304541406945681, + "grad_norm": 1.7139544486999512, + "learning_rate": 3.1316725978647687e-07, + 
"loss": 0.9245, + "mean_token_accuracy": 0.7253742963075638, + "num_tokens": 50846658.0, + "step": 177 + }, + { + "epoch": 0.06340160284951024, + "grad_norm": 1.1664118766784668, + "learning_rate": 3.149466192170818e-07, + "loss": 0.8802, + "mean_token_accuracy": 0.7368774116039276, + "num_tokens": 51135269.0, + "step": 178 + }, + { + "epoch": 0.06375779162956367, + "grad_norm": 1.189721941947937, + "learning_rate": 3.167259786476868e-07, + "loss": 0.8413, + "mean_token_accuracy": 0.7451521009206772, + "num_tokens": 51396521.0, + "step": 179 + }, + { + "epoch": 0.06411398040961709, + "grad_norm": 1.2477748394012451, + "learning_rate": 3.1850533807829177e-07, + "loss": 0.9578, + "mean_token_accuracy": 0.7213127613067627, + "num_tokens": 51680408.0, + "step": 180 + }, + { + "epoch": 0.06447016918967052, + "grad_norm": 1.2373051643371582, + "learning_rate": 3.202846975088968e-07, + "loss": 0.9498, + "mean_token_accuracy": 0.7205573916435242, + "num_tokens": 51961484.0, + "step": 181 + }, + { + "epoch": 0.06482635796972396, + "grad_norm": 1.2442598342895508, + "learning_rate": 3.220640569395018e-07, + "loss": 0.9316, + "mean_token_accuracy": 0.7240148335695267, + "num_tokens": 52238788.0, + "step": 182 + }, + { + "epoch": 0.06518254674977739, + "grad_norm": 1.1356416940689087, + "learning_rate": 3.238434163701068e-07, + "loss": 0.9058, + "mean_token_accuracy": 0.7354471236467361, + "num_tokens": 52526349.0, + "step": 183 + }, + { + "epoch": 0.0655387355298308, + "grad_norm": 1.379026174545288, + "learning_rate": 3.256227758007117e-07, + "loss": 0.8363, + "mean_token_accuracy": 0.742713674902916, + "num_tokens": 52799363.0, + "step": 184 + }, + { + "epoch": 0.06589492430988424, + "grad_norm": 1.3388680219650269, + "learning_rate": 3.274021352313167e-07, + "loss": 0.9288, + "mean_token_accuracy": 0.7266281247138977, + "num_tokens": 53088669.0, + "step": 185 + }, + { + "epoch": 0.06625111308993767, + "grad_norm": 1.452131748199463, + "learning_rate": 
3.291814946619217e-07, + "loss": 0.9863, + "mean_token_accuracy": 0.7114994376897812, + "num_tokens": 53349138.0, + "step": 186 + }, + { + "epoch": 0.0666073018699911, + "grad_norm": 1.287329912185669, + "learning_rate": 3.3096085409252667e-07, + "loss": 0.9784, + "mean_token_accuracy": 0.713035061955452, + "num_tokens": 53621276.0, + "step": 187 + }, + { + "epoch": 0.06696349065004452, + "grad_norm": 1.224009394645691, + "learning_rate": 3.3274021352313166e-07, + "loss": 0.9134, + "mean_token_accuracy": 0.7249868661165237, + "num_tokens": 53911288.0, + "step": 188 + }, + { + "epoch": 0.06731967943009795, + "grad_norm": 1.0197227001190186, + "learning_rate": 3.3451957295373664e-07, + "loss": 0.9804, + "mean_token_accuracy": 0.717997744679451, + "num_tokens": 54217029.0, + "step": 189 + }, + { + "epoch": 0.06767586821015138, + "grad_norm": 1.3060327768325806, + "learning_rate": 3.3629893238434163e-07, + "loss": 0.929, + "mean_token_accuracy": 0.7248048186302185, + "num_tokens": 54491859.0, + "step": 190 + }, + { + "epoch": 0.06803205699020481, + "grad_norm": 1.1954401731491089, + "learning_rate": 3.380782918149466e-07, + "loss": 0.9325, + "mean_token_accuracy": 0.7274532467126846, + "num_tokens": 54741823.0, + "step": 191 + }, + { + "epoch": 0.06838824577025823, + "grad_norm": 1.14543616771698, + "learning_rate": 3.398576512455516e-07, + "loss": 0.8728, + "mean_token_accuracy": 0.7356690466403961, + "num_tokens": 55032311.0, + "step": 192 + }, + { + "epoch": 0.06874443455031166, + "grad_norm": 1.17792809009552, + "learning_rate": 3.416370106761566e-07, + "loss": 0.9416, + "mean_token_accuracy": 0.7182083427906036, + "num_tokens": 55338737.0, + "step": 193 + }, + { + "epoch": 0.0691006233303651, + "grad_norm": 1.3479676246643066, + "learning_rate": 3.4341637010676157e-07, + "loss": 0.943, + "mean_token_accuracy": 0.7211614549160004, + "num_tokens": 55621298.0, + "step": 194 + }, + { + "epoch": 0.06945681211041853, + "grad_norm": 1.0057071447372437, + "learning_rate": 
3.4519572953736656e-07, + "loss": 0.9464, + "mean_token_accuracy": 0.7189490646123886, + "num_tokens": 55904700.0, + "step": 195 + }, + { + "epoch": 0.06981300089047195, + "grad_norm": 1.1338746547698975, + "learning_rate": 3.469750889679715e-07, + "loss": 0.872, + "mean_token_accuracy": 0.738811194896698, + "num_tokens": 56235623.0, + "step": 196 + }, + { + "epoch": 0.07016918967052538, + "grad_norm": 1.1938508749008179, + "learning_rate": 3.4875444839857647e-07, + "loss": 0.9774, + "mean_token_accuracy": 0.7152837216854095, + "num_tokens": 56554584.0, + "step": 197 + }, + { + "epoch": 0.07052537845057881, + "grad_norm": 1.0458942651748657, + "learning_rate": 3.5053380782918146e-07, + "loss": 0.9612, + "mean_token_accuracy": 0.7160991877317429, + "num_tokens": 56850930.0, + "step": 198 + }, + { + "epoch": 0.07088156723063224, + "grad_norm": 1.0997376441955566, + "learning_rate": 3.5231316725978644e-07, + "loss": 0.9726, + "mean_token_accuracy": 0.7133037447929382, + "num_tokens": 57113699.0, + "step": 199 + }, + { + "epoch": 0.07123775601068566, + "grad_norm": 1.0956315994262695, + "learning_rate": 3.540925266903915e-07, + "loss": 0.9243, + "mean_token_accuracy": 0.7312521040439606, + "num_tokens": 57378170.0, + "step": 200 + }, + { + "epoch": 0.07159394479073909, + "grad_norm": 0.9628228545188904, + "learning_rate": 3.5587188612099647e-07, + "loss": 0.9501, + "mean_token_accuracy": 0.7200247198343277, + "num_tokens": 57673614.0, + "step": 201 + }, + { + "epoch": 0.07195013357079252, + "grad_norm": 1.0821253061294556, + "learning_rate": 3.576512455516014e-07, + "loss": 0.8856, + "mean_token_accuracy": 0.7318536043167114, + "num_tokens": 57967117.0, + "step": 202 + }, + { + "epoch": 0.07230632235084594, + "grad_norm": 0.9481870532035828, + "learning_rate": 3.594306049822064e-07, + "loss": 0.9414, + "mean_token_accuracy": 0.7191274613142014, + "num_tokens": 58246794.0, + "step": 203 + }, + { + "epoch": 0.07266251113089937, + "grad_norm": 1.1378796100616455, + 
"learning_rate": 3.6120996441281137e-07, + "loss": 0.9001, + "mean_token_accuracy": 0.7287804782390594, + "num_tokens": 58533753.0, + "step": 204 + }, + { + "epoch": 0.0730186999109528, + "grad_norm": 1.0235555171966553, + "learning_rate": 3.6298932384341636e-07, + "loss": 0.9866, + "mean_token_accuracy": 0.7124650180339813, + "num_tokens": 58829179.0, + "step": 205 + }, + { + "epoch": 0.07337488869100624, + "grad_norm": 0.9505153298377991, + "learning_rate": 3.6476868327402134e-07, + "loss": 0.949, + "mean_token_accuracy": 0.7231602370738983, + "num_tokens": 59144443.0, + "step": 206 + }, + { + "epoch": 0.07373107747105966, + "grad_norm": 1.016247272491455, + "learning_rate": 3.6654804270462633e-07, + "loss": 0.9676, + "mean_token_accuracy": 0.7130314856767654, + "num_tokens": 59429983.0, + "step": 207 + }, + { + "epoch": 0.07408726625111309, + "grad_norm": 1.0011279582977295, + "learning_rate": 3.6832740213523126e-07, + "loss": 0.8508, + "mean_token_accuracy": 0.741634801030159, + "num_tokens": 59705103.0, + "step": 208 + }, + { + "epoch": 0.07444345503116652, + "grad_norm": 1.1595584154129028, + "learning_rate": 3.7010676156583625e-07, + "loss": 0.8973, + "mean_token_accuracy": 0.7332644611597061, + "num_tokens": 60022184.0, + "step": 209 + }, + { + "epoch": 0.07479964381121995, + "grad_norm": 1.087066888809204, + "learning_rate": 3.718861209964413e-07, + "loss": 0.9147, + "mean_token_accuracy": 0.7290670573711395, + "num_tokens": 60284241.0, + "step": 210 + }, + { + "epoch": 0.07515583259127337, + "grad_norm": 1.0040203332901, + "learning_rate": 3.7366548042704627e-07, + "loss": 0.8493, + "mean_token_accuracy": 0.7434796839952469, + "num_tokens": 60568312.0, + "step": 211 + }, + { + "epoch": 0.0755120213713268, + "grad_norm": 1.4361259937286377, + "learning_rate": 3.7544483985765126e-07, + "loss": 0.9902, + "mean_token_accuracy": 0.7090462446212769, + "num_tokens": 60832749.0, + "step": 212 + }, + { + "epoch": 0.07586821015138023, + "grad_norm": 
0.8490960597991943, + "learning_rate": 3.7722419928825624e-07, + "loss": 0.9658, + "mean_token_accuracy": 0.7133458405733109, + "num_tokens": 61151656.0, + "step": 213 + }, + { + "epoch": 0.07622439893143367, + "grad_norm": 0.9031754732131958, + "learning_rate": 3.790035587188612e-07, + "loss": 0.9036, + "mean_token_accuracy": 0.7334821820259094, + "num_tokens": 61423655.0, + "step": 214 + }, + { + "epoch": 0.07658058771148708, + "grad_norm": 0.9833385348320007, + "learning_rate": 3.8078291814946616e-07, + "loss": 1.0264, + "mean_token_accuracy": 0.7014185041189194, + "num_tokens": 61717527.0, + "step": 215 + }, + { + "epoch": 0.07693677649154052, + "grad_norm": 0.9098270535469055, + "learning_rate": 3.8256227758007115e-07, + "loss": 0.9501, + "mean_token_accuracy": 0.7173158973455429, + "num_tokens": 62010409.0, + "step": 216 + }, + { + "epoch": 0.07729296527159395, + "grad_norm": 0.9253700971603394, + "learning_rate": 3.8434163701067613e-07, + "loss": 0.9016, + "mean_token_accuracy": 0.7333426624536514, + "num_tokens": 62288277.0, + "step": 217 + }, + { + "epoch": 0.07764915405164738, + "grad_norm": 0.9505442976951599, + "learning_rate": 3.861209964412811e-07, + "loss": 0.8968, + "mean_token_accuracy": 0.7301357388496399, + "num_tokens": 62583011.0, + "step": 218 + }, + { + "epoch": 0.0780053428317008, + "grad_norm": 0.9951704144477844, + "learning_rate": 3.879003558718861e-07, + "loss": 0.991, + "mean_token_accuracy": 0.7121406197547913, + "num_tokens": 62867257.0, + "step": 219 + }, + { + "epoch": 0.07836153161175423, + "grad_norm": 0.8722975850105286, + "learning_rate": 3.896797153024911e-07, + "loss": 0.9219, + "mean_token_accuracy": 0.7296988666057587, + "num_tokens": 63172990.0, + "step": 220 + }, + { + "epoch": 0.07871772039180766, + "grad_norm": 0.9205854535102844, + "learning_rate": 3.9145907473309607e-07, + "loss": 0.938, + "mean_token_accuracy": 0.7176374793052673, + "num_tokens": 63457304.0, + "step": 221 + }, + { + "epoch": 0.07907390917186109, + 
"grad_norm": 0.938025712966919, + "learning_rate": 3.9323843416370106e-07, + "loss": 0.8598, + "mean_token_accuracy": 0.7388752549886703, + "num_tokens": 63720535.0, + "step": 222 + }, + { + "epoch": 0.07943009795191451, + "grad_norm": 0.9406570196151733, + "learning_rate": 3.9501779359430604e-07, + "loss": 0.9674, + "mean_token_accuracy": 0.7136155664920807, + "num_tokens": 64028234.0, + "step": 223 + }, + { + "epoch": 0.07978628673196794, + "grad_norm": 0.8822739720344543, + "learning_rate": 3.9679715302491103e-07, + "loss": 0.94, + "mean_token_accuracy": 0.7191321104764938, + "num_tokens": 64306031.0, + "step": 224 + }, + { + "epoch": 0.08014247551202137, + "grad_norm": 0.7731105089187622, + "learning_rate": 3.98576512455516e-07, + "loss": 0.922, + "mean_token_accuracy": 0.7244584262371063, + "num_tokens": 64601329.0, + "step": 225 + }, + { + "epoch": 0.0804986642920748, + "grad_norm": 0.8288493156433105, + "learning_rate": 4.0035587188612095e-07, + "loss": 0.8916, + "mean_token_accuracy": 0.7345909029245377, + "num_tokens": 64886062.0, + "step": 226 + }, + { + "epoch": 0.08085485307212822, + "grad_norm": 0.903895914554596, + "learning_rate": 4.0213523131672593e-07, + "loss": 0.8624, + "mean_token_accuracy": 0.7449877709150314, + "num_tokens": 65172938.0, + "step": 227 + }, + { + "epoch": 0.08121104185218166, + "grad_norm": 0.8036900162696838, + "learning_rate": 4.039145907473309e-07, + "loss": 0.8939, + "mean_token_accuracy": 0.726237028837204, + "num_tokens": 65470290.0, + "step": 228 + }, + { + "epoch": 0.08156723063223509, + "grad_norm": 0.8633949160575867, + "learning_rate": 4.0569395017793596e-07, + "loss": 0.9363, + "mean_token_accuracy": 0.7233675420284271, + "num_tokens": 65726452.0, + "step": 229 + }, + { + "epoch": 0.0819234194122885, + "grad_norm": 0.8833683133125305, + "learning_rate": 4.0747330960854094e-07, + "loss": 0.9336, + "mean_token_accuracy": 0.7289762049913406, + "num_tokens": 66021443.0, + "step": 230 + }, + { + "epoch": 
0.08227960819234194, + "grad_norm": 0.802463710308075, + "learning_rate": 4.0925266903914593e-07, + "loss": 0.9238, + "mean_token_accuracy": 0.7256439179182053, + "num_tokens": 66311278.0, + "step": 231 + }, + { + "epoch": 0.08263579697239537, + "grad_norm": 0.8124948143959045, + "learning_rate": 4.1103202846975086e-07, + "loss": 0.8851, + "mean_token_accuracy": 0.7329772114753723, + "num_tokens": 66610996.0, + "step": 232 + }, + { + "epoch": 0.0829919857524488, + "grad_norm": 0.7409277558326721, + "learning_rate": 4.1281138790035585e-07, + "loss": 0.9284, + "mean_token_accuracy": 0.7283185124397278, + "num_tokens": 66914197.0, + "step": 233 + }, + { + "epoch": 0.08334817453250222, + "grad_norm": 0.8709957003593445, + "learning_rate": 4.1459074733096083e-07, + "loss": 0.9397, + "mean_token_accuracy": 0.71269890666008, + "num_tokens": 67197245.0, + "step": 234 + }, + { + "epoch": 0.08370436331255565, + "grad_norm": 0.8129553198814392, + "learning_rate": 4.163701067615658e-07, + "loss": 0.9101, + "mean_token_accuracy": 0.7263952791690826, + "num_tokens": 67508514.0, + "step": 235 + }, + { + "epoch": 0.08406055209260908, + "grad_norm": 0.7683774828910828, + "learning_rate": 4.181494661921708e-07, + "loss": 0.8959, + "mean_token_accuracy": 0.7313990741968155, + "num_tokens": 67813660.0, + "step": 236 + }, + { + "epoch": 0.08441674087266252, + "grad_norm": 0.7996459603309631, + "learning_rate": 4.199288256227758e-07, + "loss": 0.8986, + "mean_token_accuracy": 0.7292490452528, + "num_tokens": 68108855.0, + "step": 237 + }, + { + "epoch": 0.08477292965271593, + "grad_norm": 0.7957763671875, + "learning_rate": 4.217081850533807e-07, + "loss": 0.8697, + "mean_token_accuracy": 0.7351362109184265, + "num_tokens": 68394908.0, + "step": 238 + }, + { + "epoch": 0.08512911843276937, + "grad_norm": 0.7882890105247498, + "learning_rate": 4.2348754448398576e-07, + "loss": 0.8543, + "mean_token_accuracy": 0.7378031462430954, + "num_tokens": 68689415.0, + "step": 239 + }, + { + 
"epoch": 0.0854853072128228, + "grad_norm": 0.7840797901153564, + "learning_rate": 4.2526690391459074e-07, + "loss": 0.8765, + "mean_token_accuracy": 0.7363589107990265, + "num_tokens": 68996101.0, + "step": 240 + }, + { + "epoch": 0.08584149599287623, + "grad_norm": 0.7902876138687134, + "learning_rate": 4.2704626334519573e-07, + "loss": 0.8555, + "mean_token_accuracy": 0.7404786050319672, + "num_tokens": 69336998.0, + "step": 241 + }, + { + "epoch": 0.08619768477292965, + "grad_norm": 0.8812615871429443, + "learning_rate": 4.288256227758007e-07, + "loss": 0.8536, + "mean_token_accuracy": 0.7386437207460403, + "num_tokens": 69594338.0, + "step": 242 + }, + { + "epoch": 0.08655387355298308, + "grad_norm": 0.8071882724761963, + "learning_rate": 4.306049822064057e-07, + "loss": 0.8372, + "mean_token_accuracy": 0.7481125295162201, + "num_tokens": 69880963.0, + "step": 243 + }, + { + "epoch": 0.08691006233303651, + "grad_norm": 0.8380977511405945, + "learning_rate": 4.3238434163701063e-07, + "loss": 0.897, + "mean_token_accuracy": 0.7291272133588791, + "num_tokens": 70153339.0, + "step": 244 + }, + { + "epoch": 0.08726625111308994, + "grad_norm": 0.7946248650550842, + "learning_rate": 4.341637010676156e-07, + "loss": 0.9229, + "mean_token_accuracy": 0.7298202812671661, + "num_tokens": 70411732.0, + "step": 245 + }, + { + "epoch": 0.08762243989314336, + "grad_norm": 0.7756784558296204, + "learning_rate": 4.359430604982206e-07, + "loss": 0.8225, + "mean_token_accuracy": 0.7491913586854935, + "num_tokens": 70709978.0, + "step": 246 + }, + { + "epoch": 0.0879786286731968, + "grad_norm": 0.813480794429779, + "learning_rate": 4.377224199288256e-07, + "loss": 0.8698, + "mean_token_accuracy": 0.7357879877090454, + "num_tokens": 70994912.0, + "step": 247 + }, + { + "epoch": 0.08833481745325023, + "grad_norm": 0.7579056024551392, + "learning_rate": 4.395017793594306e-07, + "loss": 0.82, + "mean_token_accuracy": 0.7521076202392578, + "num_tokens": 71299985.0, + "step": 248 + }, + 
{ + "epoch": 0.08869100623330366, + "grad_norm": 0.8700812458992004, + "learning_rate": 4.412811387900356e-07, + "loss": 0.9272, + "mean_token_accuracy": 0.7261626571416855, + "num_tokens": 71546243.0, + "step": 249 + }, + { + "epoch": 0.08904719501335707, + "grad_norm": 0.7492097616195679, + "learning_rate": 4.4306049822064055e-07, + "loss": 0.8738, + "mean_token_accuracy": 0.7369857728481293, + "num_tokens": 71840316.0, + "step": 250 + }, + { + "epoch": 0.0894033837934105, + "grad_norm": 0.8046805262565613, + "learning_rate": 4.4483985765124553e-07, + "loss": 0.9256, + "mean_token_accuracy": 0.7204148173332214, + "num_tokens": 72128625.0, + "step": 251 + }, + { + "epoch": 0.08975957257346394, + "grad_norm": 0.7837573289871216, + "learning_rate": 4.466192170818505e-07, + "loss": 0.8467, + "mean_token_accuracy": 0.7472769170999527, + "num_tokens": 72400373.0, + "step": 252 + }, + { + "epoch": 0.09011576135351737, + "grad_norm": 0.8106136918067932, + "learning_rate": 4.483985765124555e-07, + "loss": 0.9241, + "mean_token_accuracy": 0.7219822406768799, + "num_tokens": 72680175.0, + "step": 253 + }, + { + "epoch": 0.09047195013357079, + "grad_norm": 0.8206344246864319, + "learning_rate": 4.501779359430605e-07, + "loss": 0.986, + "mean_token_accuracy": 0.7124885767698288, + "num_tokens": 72961207.0, + "step": 254 + }, + { + "epoch": 0.09082813891362422, + "grad_norm": 0.8098937273025513, + "learning_rate": 4.519572953736655e-07, + "loss": 0.87, + "mean_token_accuracy": 0.7357624173164368, + "num_tokens": 73250045.0, + "step": 255 + }, + { + "epoch": 0.09118432769367765, + "grad_norm": 0.8206485509872437, + "learning_rate": 4.537366548042704e-07, + "loss": 0.8769, + "mean_token_accuracy": 0.7374915480613708, + "num_tokens": 73502028.0, + "step": 256 + }, + { + "epoch": 0.09154051647373107, + "grad_norm": 0.7927455902099609, + "learning_rate": 4.555160142348754e-07, + "loss": 0.9274, + "mean_token_accuracy": 0.7221433073282242, + "num_tokens": 73775048.0, + "step": 257 + 
}, + { + "epoch": 0.0918967052537845, + "grad_norm": 0.8188623785972595, + "learning_rate": 4.5729537366548043e-07, + "loss": 0.845, + "mean_token_accuracy": 0.7420574724674225, + "num_tokens": 74079658.0, + "step": 258 + }, + { + "epoch": 0.09225289403383793, + "grad_norm": 0.782132089138031, + "learning_rate": 4.590747330960854e-07, + "loss": 0.881, + "mean_token_accuracy": 0.7349693924188614, + "num_tokens": 74368880.0, + "step": 259 + }, + { + "epoch": 0.09260908281389137, + "grad_norm": 0.7231023907661438, + "learning_rate": 4.608540925266904e-07, + "loss": 0.8593, + "mean_token_accuracy": 0.7387087494134903, + "num_tokens": 74679316.0, + "step": 260 + }, + { + "epoch": 0.09296527159394478, + "grad_norm": 0.7408156991004944, + "learning_rate": 4.626334519572954e-07, + "loss": 0.9487, + "mean_token_accuracy": 0.7170867919921875, + "num_tokens": 74973219.0, + "step": 261 + }, + { + "epoch": 0.09332146037399822, + "grad_norm": 0.8301622867584229, + "learning_rate": 4.644128113879003e-07, + "loss": 0.962, + "mean_token_accuracy": 0.718606173992157, + "num_tokens": 75227919.0, + "step": 262 + }, + { + "epoch": 0.09367764915405165, + "grad_norm": 0.797057569026947, + "learning_rate": 4.661921708185053e-07, + "loss": 0.8832, + "mean_token_accuracy": 0.7350175827741623, + "num_tokens": 75521764.0, + "step": 263 + }, + { + "epoch": 0.09403383793410508, + "grad_norm": 0.7751001119613647, + "learning_rate": 4.679715302491103e-07, + "loss": 0.8549, + "mean_token_accuracy": 0.74117012321949, + "num_tokens": 75786862.0, + "step": 264 + }, + { + "epoch": 0.0943900267141585, + "grad_norm": 0.7520714402198792, + "learning_rate": 4.697508896797153e-07, + "loss": 0.9163, + "mean_token_accuracy": 0.7288280129432678, + "num_tokens": 76053531.0, + "step": 265 + }, + { + "epoch": 0.09474621549421193, + "grad_norm": 0.7722023725509644, + "learning_rate": 4.7153024911032026e-07, + "loss": 0.836, + "mean_token_accuracy": 0.7428942024707794, + "num_tokens": 76311636.0, + "step": 266 + 
}, + { + "epoch": 0.09510240427426536, + "grad_norm": 0.7364140152931213, + "learning_rate": 4.733096085409252e-07, + "loss": 0.9216, + "mean_token_accuracy": 0.7233846038579941, + "num_tokens": 76629103.0, + "step": 267 + }, + { + "epoch": 0.0954585930543188, + "grad_norm": 0.7300450205802917, + "learning_rate": 4.7508896797153023e-07, + "loss": 0.8781, + "mean_token_accuracy": 0.7301182001829147, + "num_tokens": 76933835.0, + "step": 268 + }, + { + "epoch": 0.09581478183437221, + "grad_norm": 0.7008783221244812, + "learning_rate": 4.768683274021353e-07, + "loss": 0.7978, + "mean_token_accuracy": 0.7568300068378448, + "num_tokens": 77268880.0, + "step": 269 + }, + { + "epoch": 0.09617097061442564, + "grad_norm": 0.728410005569458, + "learning_rate": 4.786476868327403e-07, + "loss": 0.8181, + "mean_token_accuracy": 0.7519714683294296, + "num_tokens": 77545763.0, + "step": 270 + }, + { + "epoch": 0.09652715939447908, + "grad_norm": 0.7763772010803223, + "learning_rate": 4.804270462633451e-07, + "loss": 0.868, + "mean_token_accuracy": 0.7399906516075134, + "num_tokens": 77839135.0, + "step": 271 + }, + { + "epoch": 0.09688334817453251, + "grad_norm": 0.7447232604026794, + "learning_rate": 4.822064056939501e-07, + "loss": 0.8685, + "mean_token_accuracy": 0.7359241843223572, + "num_tokens": 78129281.0, + "step": 272 + }, + { + "epoch": 0.09723953695458593, + "grad_norm": 0.7289950847625732, + "learning_rate": 4.839857651245551e-07, + "loss": 0.8897, + "mean_token_accuracy": 0.7318876683712006, + "num_tokens": 78430210.0, + "step": 273 + }, + { + "epoch": 0.09759572573463936, + "grad_norm": 0.7246371507644653, + "learning_rate": 4.857651245551601e-07, + "loss": 0.9114, + "mean_token_accuracy": 0.7225060760974884, + "num_tokens": 78733201.0, + "step": 274 + }, + { + "epoch": 0.09795191451469279, + "grad_norm": 0.7178669571876526, + "learning_rate": 4.875444839857651e-07, + "loss": 0.8885, + "mean_token_accuracy": 0.7293380051851273, + "num_tokens": 79040176.0, + "step": 
275 + }, + { + "epoch": 0.09830810329474622, + "grad_norm": 0.7635350227355957, + "learning_rate": 4.893238434163701e-07, + "loss": 0.8747, + "mean_token_accuracy": 0.7338764816522598, + "num_tokens": 79318150.0, + "step": 276 + }, + { + "epoch": 0.09866429207479964, + "grad_norm": 0.717993438243866, + "learning_rate": 4.91103202846975e-07, + "loss": 0.8385, + "mean_token_accuracy": 0.746566504240036, + "num_tokens": 79616527.0, + "step": 277 + }, + { + "epoch": 0.09902048085485307, + "grad_norm": 0.7544050812721252, + "learning_rate": 4.9288256227758e-07, + "loss": 0.9234, + "mean_token_accuracy": 0.7228162586688995, + "num_tokens": 79896492.0, + "step": 278 + }, + { + "epoch": 0.0993766696349065, + "grad_norm": 0.7673467397689819, + "learning_rate": 4.94661921708185e-07, + "loss": 0.7751, + "mean_token_accuracy": 0.7576700747013092, + "num_tokens": 80145670.0, + "step": 279 + }, + { + "epoch": 0.09973285841495994, + "grad_norm": 0.7377071976661682, + "learning_rate": 4.9644128113879e-07, + "loss": 0.851, + "mean_token_accuracy": 0.7369372695684433, + "num_tokens": 80425907.0, + "step": 280 + }, + { + "epoch": 0.10008904719501335, + "grad_norm": 0.7305501699447632, + "learning_rate": 4.98220640569395e-07, + "loss": 0.8806, + "mean_token_accuracy": 0.7321944683790207, + "num_tokens": 80708591.0, + "step": 281 + }, + { + "epoch": 0.10044523597506679, + "grad_norm": 0.7451775670051575, + "learning_rate": 5e-07, + "loss": 0.8865, + "mean_token_accuracy": 0.7362570315599442, + "num_tokens": 81010282.0, + "step": 282 + }, + { + "epoch": 0.10080142475512022, + "grad_norm": 0.8280680775642395, + "learning_rate": 5.01779359430605e-07, + "loss": 0.79, + "mean_token_accuracy": 0.756159707903862, + "num_tokens": 81291234.0, + "step": 283 + }, + { + "epoch": 0.10115761353517365, + "grad_norm": 0.7283955216407776, + "learning_rate": 5.0355871886121e-07, + "loss": 0.786, + "mean_token_accuracy": 0.7520495802164078, + "num_tokens": 81586265.0, + "step": 284 + }, + { + "epoch": 
0.10151380231522707, + "grad_norm": 0.781836211681366, + "learning_rate": 5.053380782918149e-07, + "loss": 0.8948, + "mean_token_accuracy": 0.729786142706871, + "num_tokens": 81886535.0, + "step": 285 + }, + { + "epoch": 0.1018699910952805, + "grad_norm": 0.7353518009185791, + "learning_rate": 5.071174377224199e-07, + "loss": 0.9025, + "mean_token_accuracy": 0.7306560724973679, + "num_tokens": 82196028.0, + "step": 286 + }, + { + "epoch": 0.10222617987533393, + "grad_norm": 0.7548994421958923, + "learning_rate": 5.088967971530249e-07, + "loss": 0.8446, + "mean_token_accuracy": 0.7406260222196579, + "num_tokens": 82466610.0, + "step": 287 + }, + { + "epoch": 0.10258236865538735, + "grad_norm": 0.6988789439201355, + "learning_rate": 5.106761565836298e-07, + "loss": 0.8872, + "mean_token_accuracy": 0.7359012365341187, + "num_tokens": 82780218.0, + "step": 288 + }, + { + "epoch": 0.10293855743544078, + "grad_norm": 0.704159140586853, + "learning_rate": 5.124555160142349e-07, + "loss": 0.8439, + "mean_token_accuracy": 0.7479062378406525, + "num_tokens": 83076155.0, + "step": 289 + }, + { + "epoch": 0.10329474621549421, + "grad_norm": 0.8171045780181885, + "learning_rate": 5.142348754448398e-07, + "loss": 0.9524, + "mean_token_accuracy": 0.713659331202507, + "num_tokens": 83333641.0, + "step": 290 + }, + { + "epoch": 0.10365093499554764, + "grad_norm": 0.7702825665473938, + "learning_rate": 5.160142348754448e-07, + "loss": 0.8638, + "mean_token_accuracy": 0.7375694662332535, + "num_tokens": 83605214.0, + "step": 291 + }, + { + "epoch": 0.10400712377560106, + "grad_norm": 0.7506760358810425, + "learning_rate": 5.177935943060498e-07, + "loss": 0.9262, + "mean_token_accuracy": 0.7214473634958267, + "num_tokens": 83885092.0, + "step": 292 + }, + { + "epoch": 0.1043633125556545, + "grad_norm": 0.688535749912262, + "learning_rate": 5.195729537366548e-07, + "loss": 0.8974, + "mean_token_accuracy": 0.7216181308031082, + "num_tokens": 84210610.0, + "step": 293 + }, + { + "epoch": 
0.10471950133570793, + "grad_norm": 0.7110162973403931, + "learning_rate": 5.213523131672598e-07, + "loss": 0.9064, + "mean_token_accuracy": 0.7277108877897263, + "num_tokens": 84501803.0, + "step": 294 + }, + { + "epoch": 0.10507569011576136, + "grad_norm": 0.7849475145339966, + "learning_rate": 5.231316725978647e-07, + "loss": 0.8716, + "mean_token_accuracy": 0.7389808893203735, + "num_tokens": 84776947.0, + "step": 295 + }, + { + "epoch": 0.10543187889581478, + "grad_norm": 0.734642744064331, + "learning_rate": 5.249110320284698e-07, + "loss": 0.8735, + "mean_token_accuracy": 0.7324835956096649, + "num_tokens": 85055395.0, + "step": 296 + }, + { + "epoch": 0.10578806767586821, + "grad_norm": 0.8439391851425171, + "learning_rate": 5.266903914590747e-07, + "loss": 0.9702, + "mean_token_accuracy": 0.7141130268573761, + "num_tokens": 85325290.0, + "step": 297 + }, + { + "epoch": 0.10614425645592164, + "grad_norm": 0.7593280673027039, + "learning_rate": 5.284697508896797e-07, + "loss": 0.9868, + "mean_token_accuracy": 0.7082487344741821, + "num_tokens": 85608937.0, + "step": 298 + }, + { + "epoch": 0.10650044523597507, + "grad_norm": 0.7393141388893127, + "learning_rate": 5.302491103202846e-07, + "loss": 0.8963, + "mean_token_accuracy": 0.7296396046876907, + "num_tokens": 85894111.0, + "step": 299 + }, + { + "epoch": 0.10685663401602849, + "grad_norm": 0.7413122057914734, + "learning_rate": 5.320284697508896e-07, + "loss": 0.8782, + "mean_token_accuracy": 0.7384017705917358, + "num_tokens": 86174652.0, + "step": 300 + }, + { + "epoch": 0.10721282279608192, + "grad_norm": 0.7287910580635071, + "learning_rate": 5.338078291814946e-07, + "loss": 0.851, + "mean_token_accuracy": 0.7374566048383713, + "num_tokens": 86443404.0, + "step": 301 + }, + { + "epoch": 0.10756901157613535, + "grad_norm": 0.7568879723548889, + "learning_rate": 5.355871886120996e-07, + "loss": 0.9329, + "mean_token_accuracy": 0.722750723361969, + "num_tokens": 86741084.0, + "step": 302 + }, + { + 
"epoch": 0.10792520035618879, + "grad_norm": 0.7113358974456787, + "learning_rate": 5.373665480427047e-07, + "loss": 0.9058, + "mean_token_accuracy": 0.7288830578327179, + "num_tokens": 87045874.0, + "step": 303 + }, + { + "epoch": 0.1082813891362422, + "grad_norm": 0.7508000731468201, + "learning_rate": 5.391459074733096e-07, + "loss": 0.9101, + "mean_token_accuracy": 0.723184734582901, + "num_tokens": 87312382.0, + "step": 304 + }, + { + "epoch": 0.10863757791629564, + "grad_norm": 0.7912318110466003, + "learning_rate": 5.409252669039146e-07, + "loss": 0.8803, + "mean_token_accuracy": 0.7351911962032318, + "num_tokens": 87582869.0, + "step": 305 + }, + { + "epoch": 0.10899376669634907, + "grad_norm": 0.718353271484375, + "learning_rate": 5.427046263345195e-07, + "loss": 0.8364, + "mean_token_accuracy": 0.7467453479766846, + "num_tokens": 87876658.0, + "step": 306 + }, + { + "epoch": 0.1093499554764025, + "grad_norm": 0.7193779945373535, + "learning_rate": 5.444839857651245e-07, + "loss": 0.8869, + "mean_token_accuracy": 0.7320912182331085, + "num_tokens": 88172792.0, + "step": 307 + }, + { + "epoch": 0.10970614425645592, + "grad_norm": 0.7346266508102417, + "learning_rate": 5.462633451957295e-07, + "loss": 0.8576, + "mean_token_accuracy": 0.7406808435916901, + "num_tokens": 88474301.0, + "step": 308 + }, + { + "epoch": 0.11006233303650935, + "grad_norm": 0.7276808619499207, + "learning_rate": 5.480427046263345e-07, + "loss": 0.8483, + "mean_token_accuracy": 0.7371818125247955, + "num_tokens": 88780136.0, + "step": 309 + }, + { + "epoch": 0.11041852181656278, + "grad_norm": 0.719054639339447, + "learning_rate": 5.498220640569395e-07, + "loss": 0.8994, + "mean_token_accuracy": 0.7315825819969177, + "num_tokens": 89074406.0, + "step": 310 + }, + { + "epoch": 0.11077471059661621, + "grad_norm": 0.7538154721260071, + "learning_rate": 5.516014234875445e-07, + "loss": 0.8273, + "mean_token_accuracy": 0.7443096041679382, + "num_tokens": 89348946.0, + "step": 311 + }, + { 
+ "epoch": 0.11113089937666963, + "grad_norm": 0.7547476291656494, + "learning_rate": 5.533807829181495e-07, + "loss": 0.8766, + "mean_token_accuracy": 0.7368558049201965, + "num_tokens": 89636647.0, + "step": 312 + }, + { + "epoch": 0.11148708815672306, + "grad_norm": 0.680946946144104, + "learning_rate": 5.551601423487544e-07, + "loss": 0.8798, + "mean_token_accuracy": 0.7383662760257721, + "num_tokens": 89968489.0, + "step": 313 + }, + { + "epoch": 0.1118432769367765, + "grad_norm": 0.7139073610305786, + "learning_rate": 5.569395017793594e-07, + "loss": 0.8789, + "mean_token_accuracy": 0.7345536202192307, + "num_tokens": 90243451.0, + "step": 314 + }, + { + "epoch": 0.11219946571682991, + "grad_norm": 0.7120301127433777, + "learning_rate": 5.587188612099644e-07, + "loss": 0.8334, + "mean_token_accuracy": 0.7453904896974564, + "num_tokens": 90529172.0, + "step": 315 + }, + { + "epoch": 0.11255565449688335, + "grad_norm": 0.7588550448417664, + "learning_rate": 5.604982206405694e-07, + "loss": 0.8546, + "mean_token_accuracy": 0.7406832724809647, + "num_tokens": 90788376.0, + "step": 316 + }, + { + "epoch": 0.11291184327693678, + "grad_norm": 0.7450641989707947, + "learning_rate": 5.622775800711744e-07, + "loss": 0.9384, + "mean_token_accuracy": 0.7181587219238281, + "num_tokens": 91064804.0, + "step": 317 + }, + { + "epoch": 0.11326803205699021, + "grad_norm": 0.6943345069885254, + "learning_rate": 5.640569395017794e-07, + "loss": 0.8398, + "mean_token_accuracy": 0.7478576004505157, + "num_tokens": 91375524.0, + "step": 318 + }, + { + "epoch": 0.11362422083704363, + "grad_norm": 0.7227373123168945, + "learning_rate": 5.658362989323842e-07, + "loss": 0.8496, + "mean_token_accuracy": 0.7438309490680695, + "num_tokens": 91663776.0, + "step": 319 + }, + { + "epoch": 0.11398040961709706, + "grad_norm": 0.7788522839546204, + "learning_rate": 5.676156583629893e-07, + "loss": 0.8627, + "mean_token_accuracy": 0.7390216588973999, + "num_tokens": 91917226.0, + "step": 320 + 
}, + { + "epoch": 0.11433659839715049, + "grad_norm": 0.6776118278503418, + "learning_rate": 5.693950177935943e-07, + "loss": 0.8545, + "mean_token_accuracy": 0.7416425347328186, + "num_tokens": 92244619.0, + "step": 321 + }, + { + "epoch": 0.11469278717720392, + "grad_norm": 0.728538453578949, + "learning_rate": 5.711743772241993e-07, + "loss": 0.8397, + "mean_token_accuracy": 0.7468499839305878, + "num_tokens": 92534697.0, + "step": 322 + }, + { + "epoch": 0.11504897595725734, + "grad_norm": 0.7623713612556458, + "learning_rate": 5.729537366548043e-07, + "loss": 0.9026, + "mean_token_accuracy": 0.7304906100034714, + "num_tokens": 92802836.0, + "step": 323 + }, + { + "epoch": 0.11540516473731077, + "grad_norm": 0.7728430032730103, + "learning_rate": 5.747330960854092e-07, + "loss": 0.8487, + "mean_token_accuracy": 0.7437448054552078, + "num_tokens": 93065268.0, + "step": 324 + }, + { + "epoch": 0.1157613535173642, + "grad_norm": 0.6881632208824158, + "learning_rate": 5.765124555160142e-07, + "loss": 0.7985, + "mean_token_accuracy": 0.7572545409202576, + "num_tokens": 93376011.0, + "step": 325 + }, + { + "epoch": 0.11611754229741764, + "grad_norm": 0.7202650904655457, + "learning_rate": 5.782918149466191e-07, + "loss": 0.8406, + "mean_token_accuracy": 0.741943359375, + "num_tokens": 93669871.0, + "step": 326 + }, + { + "epoch": 0.11647373107747105, + "grad_norm": 0.7198943495750427, + "learning_rate": 5.800711743772242e-07, + "loss": 0.8104, + "mean_token_accuracy": 0.7529194951057434, + "num_tokens": 93984385.0, + "step": 327 + }, + { + "epoch": 0.11682991985752449, + "grad_norm": 0.723523736000061, + "learning_rate": 5.818505338078291e-07, + "loss": 0.8565, + "mean_token_accuracy": 0.7358394116163254, + "num_tokens": 94242455.0, + "step": 328 + }, + { + "epoch": 0.11718610863757792, + "grad_norm": 0.7085947394371033, + "learning_rate": 5.836298932384342e-07, + "loss": 0.8326, + "mean_token_accuracy": 0.744693249464035, + "num_tokens": 94558710.0, + "step": 329 + 
}, + { + "epoch": 0.11754229741763135, + "grad_norm": 0.6569757461547852, + "learning_rate": 5.854092526690391e-07, + "loss": 0.9059, + "mean_token_accuracy": 0.7271620035171509, + "num_tokens": 94872779.0, + "step": 330 + }, + { + "epoch": 0.11789848619768477, + "grad_norm": 0.6883683204650879, + "learning_rate": 5.871886120996441e-07, + "loss": 0.8252, + "mean_token_accuracy": 0.7465459704399109, + "num_tokens": 95172311.0, + "step": 331 + }, + { + "epoch": 0.1182546749777382, + "grad_norm": 0.7101216316223145, + "learning_rate": 5.889679715302491e-07, + "loss": 0.8756, + "mean_token_accuracy": 0.7296458184719086, + "num_tokens": 95443790.0, + "step": 332 + }, + { + "epoch": 0.11861086375779163, + "grad_norm": 0.6639065742492676, + "learning_rate": 5.90747330960854e-07, + "loss": 0.9145, + "mean_token_accuracy": 0.7236893177032471, + "num_tokens": 95766208.0, + "step": 333 + }, + { + "epoch": 0.11896705253784506, + "grad_norm": 0.705662727355957, + "learning_rate": 5.925266903914591e-07, + "loss": 0.8545, + "mean_token_accuracy": 0.7470201849937439, + "num_tokens": 96057827.0, + "step": 334 + }, + { + "epoch": 0.11932324131789848, + "grad_norm": 0.7108069658279419, + "learning_rate": 5.94306049822064e-07, + "loss": 0.8766, + "mean_token_accuracy": 0.7392838001251221, + "num_tokens": 96347799.0, + "step": 335 + }, + { + "epoch": 0.11967943009795191, + "grad_norm": 0.7228357791900635, + "learning_rate": 5.96085409252669e-07, + "loss": 0.8587, + "mean_token_accuracy": 0.7354053854942322, + "num_tokens": 96624476.0, + "step": 336 + }, + { + "epoch": 0.12003561887800535, + "grad_norm": 0.7306438684463501, + "learning_rate": 5.97864768683274e-07, + "loss": 0.9482, + "mean_token_accuracy": 0.7215323448181152, + "num_tokens": 96913276.0, + "step": 337 + }, + { + "epoch": 0.12039180765805878, + "grad_norm": 0.7549405097961426, + "learning_rate": 5.99644128113879e-07, + "loss": 0.8569, + "mean_token_accuracy": 0.7366433441638947, + "num_tokens": 97192257.0, + "step": 338 + 
}, + { + "epoch": 0.1207479964381122, + "grad_norm": 0.7334268689155579, + "learning_rate": 6.014234875444839e-07, + "loss": 0.9118, + "mean_token_accuracy": 0.7240595072507858, + "num_tokens": 97468263.0, + "step": 339 + }, + { + "epoch": 0.12110418521816563, + "grad_norm": 0.7349423766136169, + "learning_rate": 6.032028469750889e-07, + "loss": 0.8909, + "mean_token_accuracy": 0.7291035056114197, + "num_tokens": 97764438.0, + "step": 340 + }, + { + "epoch": 0.12146037399821906, + "grad_norm": 0.7242820262908936, + "learning_rate": 6.04982206405694e-07, + "loss": 0.889, + "mean_token_accuracy": 0.7355014830827713, + "num_tokens": 98037128.0, + "step": 341 + }, + { + "epoch": 0.12181656277827249, + "grad_norm": 0.7397364377975464, + "learning_rate": 6.067615658362989e-07, + "loss": 0.8759, + "mean_token_accuracy": 0.734344482421875, + "num_tokens": 98320996.0, + "step": 342 + }, + { + "epoch": 0.12217275155832591, + "grad_norm": 0.6856557130813599, + "learning_rate": 6.085409252669039e-07, + "loss": 0.9025, + "mean_token_accuracy": 0.7322548031806946, + "num_tokens": 98631826.0, + "step": 343 + }, + { + "epoch": 0.12252894033837934, + "grad_norm": 0.7015030980110168, + "learning_rate": 6.103202846975089e-07, + "loss": 0.7774, + "mean_token_accuracy": 0.7601407617330551, + "num_tokens": 98923130.0, + "step": 344 + }, + { + "epoch": 0.12288512911843277, + "grad_norm": 0.7607766389846802, + "learning_rate": 6.120996441281139e-07, + "loss": 0.8631, + "mean_token_accuracy": 0.7377814203500748, + "num_tokens": 99199095.0, + "step": 345 + }, + { + "epoch": 0.12324131789848619, + "grad_norm": 0.7089953422546387, + "learning_rate": 6.138790035587188e-07, + "loss": 0.8737, + "mean_token_accuracy": 0.7361233532428741, + "num_tokens": 99484813.0, + "step": 346 + }, + { + "epoch": 0.12359750667853962, + "grad_norm": 0.6936108469963074, + "learning_rate": 6.156583629893238e-07, + "loss": 0.8133, + "mean_token_accuracy": 0.752759650349617, + "num_tokens": 99783377.0, + "step": 347 
+ }, + { + "epoch": 0.12395369545859306, + "grad_norm": 0.7112177610397339, + "learning_rate": 6.174377224199287e-07, + "loss": 0.858, + "mean_token_accuracy": 0.7424372881650925, + "num_tokens": 100064381.0, + "step": 348 + }, + { + "epoch": 0.12430988423864649, + "grad_norm": 0.7029979825019836, + "learning_rate": 6.192170818505338e-07, + "loss": 0.8372, + "mean_token_accuracy": 0.7474607527256012, + "num_tokens": 100367608.0, + "step": 349 + }, + { + "epoch": 0.1246660730186999, + "grad_norm": 0.6773682832717896, + "learning_rate": 6.209964412811388e-07, + "loss": 0.8402, + "mean_token_accuracy": 0.745267316699028, + "num_tokens": 100676337.0, + "step": 350 + }, + { + "epoch": 0.12502226179875334, + "grad_norm": 0.7061133980751038, + "learning_rate": 6.227758007117438e-07, + "loss": 0.8482, + "mean_token_accuracy": 0.7436023503541946, + "num_tokens": 100958326.0, + "step": 351 + }, + { + "epoch": 0.12537845057880675, + "grad_norm": 0.7687330842018127, + "learning_rate": 6.245551601423488e-07, + "loss": 0.8093, + "mean_token_accuracy": 0.7460146546363831, + "num_tokens": 101241745.0, + "step": 352 + }, + { + "epoch": 0.1257346393588602, + "grad_norm": 0.7045417428016663, + "learning_rate": 6.263345195729537e-07, + "loss": 0.857, + "mean_token_accuracy": 0.7397503554821014, + "num_tokens": 101525653.0, + "step": 353 + }, + { + "epoch": 0.12609082813891362, + "grad_norm": 0.7289361953735352, + "learning_rate": 6.281138790035587e-07, + "loss": 0.8962, + "mean_token_accuracy": 0.7358068525791168, + "num_tokens": 101800683.0, + "step": 354 + }, + { + "epoch": 0.12644701691896706, + "grad_norm": 0.7443919777870178, + "learning_rate": 6.298932384341636e-07, + "loss": 0.9291, + "mean_token_accuracy": 0.7163188457489014, + "num_tokens": 102077595.0, + "step": 355 + }, + { + "epoch": 0.12680320569902048, + "grad_norm": 0.7073091268539429, + "learning_rate": 6.316725978647687e-07, + "loss": 0.852, + "mean_token_accuracy": 0.7382653951644897, + "num_tokens": 102362811.0, + 
"step": 356 + }, + { + "epoch": 0.1271593944790739, + "grad_norm": 0.7052621841430664, + "learning_rate": 6.334519572953736e-07, + "loss": 0.8794, + "mean_token_accuracy": 0.7324045747518539, + "num_tokens": 102650249.0, + "step": 357 + }, + { + "epoch": 0.12751558325912735, + "grad_norm": 0.6810793876647949, + "learning_rate": 6.352313167259787e-07, + "loss": 0.7765, + "mean_token_accuracy": 0.7542746812105179, + "num_tokens": 102948025.0, + "step": 358 + }, + { + "epoch": 0.12787177203918076, + "grad_norm": 0.7127146124839783, + "learning_rate": 6.370106761565835e-07, + "loss": 0.8555, + "mean_token_accuracy": 0.7372837662696838, + "num_tokens": 103237104.0, + "step": 359 + }, + { + "epoch": 0.12822796081923418, + "grad_norm": 0.6813823580741882, + "learning_rate": 6.387900355871885e-07, + "loss": 0.8548, + "mean_token_accuracy": 0.7443459182977676, + "num_tokens": 103546986.0, + "step": 360 + }, + { + "epoch": 0.12858414959928763, + "grad_norm": 0.7250629663467407, + "learning_rate": 6.405693950177936e-07, + "loss": 0.8706, + "mean_token_accuracy": 0.7310531139373779, + "num_tokens": 103814170.0, + "step": 361 + }, + { + "epoch": 0.12894033837934105, + "grad_norm": 0.7615689039230347, + "learning_rate": 6.423487544483985e-07, + "loss": 0.8318, + "mean_token_accuracy": 0.7451020777225494, + "num_tokens": 104065350.0, + "step": 362 + }, + { + "epoch": 0.1292965271593945, + "grad_norm": 0.7029721736907959, + "learning_rate": 6.441281138790036e-07, + "loss": 0.9011, + "mean_token_accuracy": 0.7285725176334381, + "num_tokens": 104378614.0, + "step": 363 + }, + { + "epoch": 0.1296527159394479, + "grad_norm": 0.716820478439331, + "learning_rate": 6.459074733096085e-07, + "loss": 0.9028, + "mean_token_accuracy": 0.7236183732748032, + "num_tokens": 104683350.0, + "step": 364 + }, + { + "epoch": 0.13000890471950133, + "grad_norm": 0.6978357434272766, + "learning_rate": 6.476868327402136e-07, + "loss": 0.8449, + "mean_token_accuracy": 0.7351332008838654, + "num_tokens": 
104983252.0, + "step": 365 + }, + { + "epoch": 0.13036509349955477, + "grad_norm": 0.7074403166770935, + "learning_rate": 6.494661921708184e-07, + "loss": 0.8975, + "mean_token_accuracy": 0.7288137227296829, + "num_tokens": 105275818.0, + "step": 366 + }, + { + "epoch": 0.1307212822796082, + "grad_norm": 0.7287271022796631, + "learning_rate": 6.512455516014234e-07, + "loss": 0.8831, + "mean_token_accuracy": 0.7342758625745773, + "num_tokens": 105543951.0, + "step": 367 + }, + { + "epoch": 0.1310774710596616, + "grad_norm": 0.6816250085830688, + "learning_rate": 6.530249110320284e-07, + "loss": 0.8627, + "mean_token_accuracy": 0.7362540811300278, + "num_tokens": 105837958.0, + "step": 368 + }, + { + "epoch": 0.13143365983971506, + "grad_norm": 0.7080026268959045, + "learning_rate": 6.548042704626334e-07, + "loss": 0.852, + "mean_token_accuracy": 0.7435495108366013, + "num_tokens": 106157921.0, + "step": 369 + }, + { + "epoch": 0.13178984861976847, + "grad_norm": 0.6881864666938782, + "learning_rate": 6.565836298932385e-07, + "loss": 0.8163, + "mean_token_accuracy": 0.7475031018257141, + "num_tokens": 106458395.0, + "step": 370 + }, + { + "epoch": 0.1321460373998219, + "grad_norm": 0.7457827925682068, + "learning_rate": 6.583629893238434e-07, + "loss": 0.8452, + "mean_token_accuracy": 0.7381798923015594, + "num_tokens": 106713721.0, + "step": 371 + }, + { + "epoch": 0.13250222617987534, + "grad_norm": 0.7052323222160339, + "learning_rate": 6.601423487544484e-07, + "loss": 0.9228, + "mean_token_accuracy": 0.7222405523061752, + "num_tokens": 107015770.0, + "step": 372 + }, + { + "epoch": 0.13285841495992876, + "grad_norm": 0.6833318471908569, + "learning_rate": 6.619217081850533e-07, + "loss": 0.8276, + "mean_token_accuracy": 0.7471820265054703, + "num_tokens": 107327908.0, + "step": 373 + }, + { + "epoch": 0.1332146037399822, + "grad_norm": 0.6777209639549255, + "learning_rate": 6.637010676156583e-07, + "loss": 0.7969, + "mean_token_accuracy": 0.7510609328746796, + 
"num_tokens": 107639821.0, + "step": 374 + }, + { + "epoch": 0.13357079252003562, + "grad_norm": 0.6895166039466858, + "learning_rate": 6.654804270462633e-07, + "loss": 0.8683, + "mean_token_accuracy": 0.7395746409893036, + "num_tokens": 107931614.0, + "step": 375 + }, + { + "epoch": 0.13392698130008904, + "grad_norm": 0.7018929123878479, + "learning_rate": 6.672597864768683e-07, + "loss": 0.8493, + "mean_token_accuracy": 0.7469554394483566, + "num_tokens": 108205189.0, + "step": 376 + }, + { + "epoch": 0.13428317008014248, + "grad_norm": 0.7448248267173767, + "learning_rate": 6.690391459074733e-07, + "loss": 0.9026, + "mean_token_accuracy": 0.7230657935142517, + "num_tokens": 108492175.0, + "step": 377 + }, + { + "epoch": 0.1346393588601959, + "grad_norm": 0.7616023421287537, + "learning_rate": 6.708185053380783e-07, + "loss": 0.8561, + "mean_token_accuracy": 0.7335499376058578, + "num_tokens": 108788794.0, + "step": 378 + }, + { + "epoch": 0.13499554764024932, + "grad_norm": 0.7541584372520447, + "learning_rate": 6.725978647686833e-07, + "loss": 0.8597, + "mean_token_accuracy": 0.742985874414444, + "num_tokens": 109071074.0, + "step": 379 + }, + { + "epoch": 0.13535173642030277, + "grad_norm": 0.7176157832145691, + "learning_rate": 6.743772241992882e-07, + "loss": 0.8272, + "mean_token_accuracy": 0.7489974647760391, + "num_tokens": 109342350.0, + "step": 380 + }, + { + "epoch": 0.13570792520035618, + "grad_norm": 0.777927041053772, + "learning_rate": 6.761565836298932e-07, + "loss": 0.8011, + "mean_token_accuracy": 0.7531100511550903, + "num_tokens": 109591817.0, + "step": 381 + }, + { + "epoch": 0.13606411398040963, + "grad_norm": 0.7579470872879028, + "learning_rate": 6.779359430604982e-07, + "loss": 0.8173, + "mean_token_accuracy": 0.747642919421196, + "num_tokens": 109873174.0, + "step": 382 + }, + { + "epoch": 0.13642030276046305, + "grad_norm": 0.7136251926422119, + "learning_rate": 6.797153024911032e-07, + "loss": 0.8832, + "mean_token_accuracy": 
0.7338443100452423, + "num_tokens": 110152038.0, + "step": 383 + }, + { + "epoch": 0.13677649154051646, + "grad_norm": 0.8517240285873413, + "learning_rate": 6.814946619217081e-07, + "loss": 0.8609, + "mean_token_accuracy": 0.7381782084703445, + "num_tokens": 110381941.0, + "step": 384 + }, + { + "epoch": 0.1371326803205699, + "grad_norm": 0.7478604912757874, + "learning_rate": 6.832740213523132e-07, + "loss": 0.8483, + "mean_token_accuracy": 0.7415430396795273, + "num_tokens": 110648234.0, + "step": 385 + }, + { + "epoch": 0.13748886910062333, + "grad_norm": 0.7780990600585938, + "learning_rate": 6.85053380782918e-07, + "loss": 0.8237, + "mean_token_accuracy": 0.7452669441699982, + "num_tokens": 110928518.0, + "step": 386 + }, + { + "epoch": 0.13784505788067675, + "grad_norm": 0.7106236815452576, + "learning_rate": 6.868327402135231e-07, + "loss": 0.746, + "mean_token_accuracy": 0.7700026482343674, + "num_tokens": 111217681.0, + "step": 387 + }, + { + "epoch": 0.1382012466607302, + "grad_norm": 0.7054415345191956, + "learning_rate": 6.88612099644128e-07, + "loss": 0.8548, + "mean_token_accuracy": 0.7414819598197937, + "num_tokens": 111495692.0, + "step": 388 + }, + { + "epoch": 0.1385574354407836, + "grad_norm": 0.6984289884567261, + "learning_rate": 6.903914590747331e-07, + "loss": 0.754, + "mean_token_accuracy": 0.7694248855113983, + "num_tokens": 111780667.0, + "step": 389 + }, + { + "epoch": 0.13891362422083706, + "grad_norm": 0.7132795453071594, + "learning_rate": 6.921708185053381e-07, + "loss": 0.8493, + "mean_token_accuracy": 0.744401216506958, + "num_tokens": 112080501.0, + "step": 390 + }, + { + "epoch": 0.13926981300089047, + "grad_norm": 0.6970193982124329, + "learning_rate": 6.93950177935943e-07, + "loss": 0.7758, + "mean_token_accuracy": 0.7618411928415298, + "num_tokens": 112367242.0, + "step": 391 + }, + { + "epoch": 0.1396260017809439, + "grad_norm": 0.7112011909484863, + "learning_rate": 6.957295373665481e-07, + "loss": 0.7982, + 
"mean_token_accuracy": 0.7534142732620239, + "num_tokens": 112664152.0, + "step": 392 + }, + { + "epoch": 0.13998219056099734, + "grad_norm": 0.7215514779090881, + "learning_rate": 6.975088967971529e-07, + "loss": 0.8108, + "mean_token_accuracy": 0.7546986639499664, + "num_tokens": 112935586.0, + "step": 393 + }, + { + "epoch": 0.14033837934105076, + "grad_norm": 0.6978161931037903, + "learning_rate": 6.99288256227758e-07, + "loss": 0.7993, + "mean_token_accuracy": 0.7498926967382431, + "num_tokens": 113220795.0, + "step": 394 + }, + { + "epoch": 0.14069456812110417, + "grad_norm": 0.692151665687561, + "learning_rate": 7.010676156583629e-07, + "loss": 0.8031, + "mean_token_accuracy": 0.7516372501850128, + "num_tokens": 113530601.0, + "step": 395 + }, + { + "epoch": 0.14105075690115762, + "grad_norm": 0.6939737796783447, + "learning_rate": 7.028469750889679e-07, + "loss": 0.8498, + "mean_token_accuracy": 0.7439578622579575, + "num_tokens": 113819191.0, + "step": 396 + }, + { + "epoch": 0.14140694568121104, + "grad_norm": 0.7364786863327026, + "learning_rate": 7.046263345195729e-07, + "loss": 0.8754, + "mean_token_accuracy": 0.7313510179519653, + "num_tokens": 114101038.0, + "step": 397 + }, + { + "epoch": 0.14176313446126448, + "grad_norm": 0.6768140196800232, + "learning_rate": 7.064056939501779e-07, + "loss": 0.8562, + "mean_token_accuracy": 0.7335629761219025, + "num_tokens": 114397369.0, + "step": 398 + }, + { + "epoch": 0.1421193232413179, + "grad_norm": 0.6486372947692871, + "learning_rate": 7.08185053380783e-07, + "loss": 0.8612, + "mean_token_accuracy": 0.7417990118265152, + "num_tokens": 114720679.0, + "step": 399 + }, + { + "epoch": 0.14247551202137132, + "grad_norm": 0.7563493847846985, + "learning_rate": 7.099644128113878e-07, + "loss": 0.85, + "mean_token_accuracy": 0.7407055348157883, + "num_tokens": 114990095.0, + "step": 400 + }, + { + "epoch": 0.14283170080142477, + "grad_norm": 0.7130924463272095, + "learning_rate": 7.117437722419929e-07, + "loss": 
0.8737, + "mean_token_accuracy": 0.7348067164421082, + "num_tokens": 115294182.0, + "step": 401 + }, + { + "epoch": 0.14318788958147818, + "grad_norm": 0.743617057800293, + "learning_rate": 7.135231316725978e-07, + "loss": 0.8296, + "mean_token_accuracy": 0.7393983006477356, + "num_tokens": 115561286.0, + "step": 402 + }, + { + "epoch": 0.1435440783615316, + "grad_norm": 0.7125967741012573, + "learning_rate": 7.153024911032028e-07, + "loss": 0.8895, + "mean_token_accuracy": 0.7351196259260178, + "num_tokens": 115847200.0, + "step": 403 + }, + { + "epoch": 0.14390026714158505, + "grad_norm": 0.7150217890739441, + "learning_rate": 7.170818505338078e-07, + "loss": 0.8895, + "mean_token_accuracy": 0.7334041446447372, + "num_tokens": 116129989.0, + "step": 404 + }, + { + "epoch": 0.14425645592163847, + "grad_norm": 0.7810922861099243, + "learning_rate": 7.188612099644128e-07, + "loss": 0.7942, + "mean_token_accuracy": 0.7530809044837952, + "num_tokens": 116382659.0, + "step": 405 + }, + { + "epoch": 0.14461264470169188, + "grad_norm": 0.755226194858551, + "learning_rate": 7.206405693950178e-07, + "loss": 0.8629, + "mean_token_accuracy": 0.733656570315361, + "num_tokens": 116655890.0, + "step": 406 + }, + { + "epoch": 0.14496883348174533, + "grad_norm": 0.6858718991279602, + "learning_rate": 7.224199288256227e-07, + "loss": 0.8779, + "mean_token_accuracy": 0.7331413775682449, + "num_tokens": 116953147.0, + "step": 407 + }, + { + "epoch": 0.14532502226179875, + "grad_norm": 0.7195025682449341, + "learning_rate": 7.241992882562277e-07, + "loss": 0.8504, + "mean_token_accuracy": 0.7422638237476349, + "num_tokens": 117239577.0, + "step": 408 + }, + { + "epoch": 0.1456812110418522, + "grad_norm": 0.6774147152900696, + "learning_rate": 7.259786476868327e-07, + "loss": 0.7794, + "mean_token_accuracy": 0.7631754726171494, + "num_tokens": 117545012.0, + "step": 409 + }, + { + "epoch": 0.1460373998219056, + "grad_norm": 0.7014650702476501, + "learning_rate": 7.277580071174377e-07, 
+ "loss": 0.8226, + "mean_token_accuracy": 0.7490649223327637, + "num_tokens": 117857877.0, + "step": 410 + }, + { + "epoch": 0.14639358860195903, + "grad_norm": 0.6838301420211792, + "learning_rate": 7.295373665480427e-07, + "loss": 0.837, + "mean_token_accuracy": 0.7483471482992172, + "num_tokens": 118150413.0, + "step": 411 + }, + { + "epoch": 0.14674977738201248, + "grad_norm": 0.6844708919525146, + "learning_rate": 7.313167259786477e-07, + "loss": 0.8007, + "mean_token_accuracy": 0.7560314387083054, + "num_tokens": 118447078.0, + "step": 412 + }, + { + "epoch": 0.1471059661620659, + "grad_norm": 0.7098121047019958, + "learning_rate": 7.330960854092527e-07, + "loss": 0.8855, + "mean_token_accuracy": 0.7301998734474182, + "num_tokens": 118760503.0, + "step": 413 + }, + { + "epoch": 0.1474621549421193, + "grad_norm": 0.7041444182395935, + "learning_rate": 7.348754448398576e-07, + "loss": 0.8442, + "mean_token_accuracy": 0.7374758273363113, + "num_tokens": 119077849.0, + "step": 414 + }, + { + "epoch": 0.14781834372217276, + "grad_norm": 0.6719407439231873, + "learning_rate": 7.366548042704625e-07, + "loss": 0.9088, + "mean_token_accuracy": 0.7229574620723724, + "num_tokens": 119425179.0, + "step": 415 + }, + { + "epoch": 0.14817453250222618, + "grad_norm": 0.7520114183425903, + "learning_rate": 7.384341637010676e-07, + "loss": 0.7955, + "mean_token_accuracy": 0.7496415972709656, + "num_tokens": 119680809.0, + "step": 416 + }, + { + "epoch": 0.14853072128227962, + "grad_norm": 0.6884881854057312, + "learning_rate": 7.402135231316725e-07, + "loss": 0.8242, + "mean_token_accuracy": 0.7454087734222412, + "num_tokens": 119982629.0, + "step": 417 + }, + { + "epoch": 0.14888691006233304, + "grad_norm": 0.7593318819999695, + "learning_rate": 7.419928825622776e-07, + "loss": 0.872, + "mean_token_accuracy": 0.739609882235527, + "num_tokens": 120250329.0, + "step": 418 + }, + { + "epoch": 0.14924309884238646, + "grad_norm": 0.7197626829147339, + "learning_rate": 
7.437722419928826e-07, + "loss": 0.8425, + "mean_token_accuracy": 0.7466948479413986, + "num_tokens": 120560152.0, + "step": 419 + }, + { + "epoch": 0.1495992876224399, + "grad_norm": 0.6821435689926147, + "learning_rate": 7.455516014234874e-07, + "loss": 0.8318, + "mean_token_accuracy": 0.7437402606010437, + "num_tokens": 120862977.0, + "step": 420 + }, + { + "epoch": 0.14995547640249332, + "grad_norm": 0.7021756172180176, + "learning_rate": 7.473309608540925e-07, + "loss": 0.8469, + "mean_token_accuracy": 0.7388840764760971, + "num_tokens": 121155464.0, + "step": 421 + }, + { + "epoch": 0.15031166518254674, + "grad_norm": 0.7483583688735962, + "learning_rate": 7.491103202846974e-07, + "loss": 0.8762, + "mean_token_accuracy": 0.7334829568862915, + "num_tokens": 121449585.0, + "step": 422 + }, + { + "epoch": 0.15066785396260018, + "grad_norm": 0.7078438997268677, + "learning_rate": 7.508896797153025e-07, + "loss": 0.818, + "mean_token_accuracy": 0.7488051801919937, + "num_tokens": 121761395.0, + "step": 423 + }, + { + "epoch": 0.1510240427426536, + "grad_norm": 0.6790430545806885, + "learning_rate": 7.526690391459074e-07, + "loss": 0.7859, + "mean_token_accuracy": 0.7521216869354248, + "num_tokens": 122080011.0, + "step": 424 + }, + { + "epoch": 0.15138023152270705, + "grad_norm": 0.6783570051193237, + "learning_rate": 7.544483985765125e-07, + "loss": 0.8519, + "mean_token_accuracy": 0.7484552264213562, + "num_tokens": 122385411.0, + "step": 425 + }, + { + "epoch": 0.15173642030276047, + "grad_norm": 0.7523171901702881, + "learning_rate": 7.562277580071174e-07, + "loss": 0.8374, + "mean_token_accuracy": 0.7449071705341339, + "num_tokens": 122656393.0, + "step": 426 + }, + { + "epoch": 0.15209260908281388, + "grad_norm": 0.7148206830024719, + "learning_rate": 7.580071174377223e-07, + "loss": 0.7708, + "mean_token_accuracy": 0.7584453374147415, + "num_tokens": 122949615.0, + "step": 427 + }, + { + "epoch": 0.15244879786286733, + "grad_norm": 0.7667486071586609, + 
"learning_rate": 7.597864768683274e-07, + "loss": 0.9141, + "mean_token_accuracy": 0.7247387170791626, + "num_tokens": 123212815.0, + "step": 428 + }, + { + "epoch": 0.15280498664292075, + "grad_norm": 0.7049400210380554, + "learning_rate": 7.615658362989323e-07, + "loss": 0.8394, + "mean_token_accuracy": 0.7432080656290054, + "num_tokens": 123509811.0, + "step": 429 + }, + { + "epoch": 0.15316117542297417, + "grad_norm": 0.6730336546897888, + "learning_rate": 7.633451957295374e-07, + "loss": 0.8449, + "mean_token_accuracy": 0.7406093627214432, + "num_tokens": 123827346.0, + "step": 430 + }, + { + "epoch": 0.1535173642030276, + "grad_norm": 0.7004762887954712, + "learning_rate": 7.651245551601423e-07, + "loss": 0.9332, + "mean_token_accuracy": 0.71953384578228, + "num_tokens": 124134791.0, + "step": 431 + }, + { + "epoch": 0.15387355298308103, + "grad_norm": 0.7417648434638977, + "learning_rate": 7.669039145907473e-07, + "loss": 0.8757, + "mean_token_accuracy": 0.7329484820365906, + "num_tokens": 124425281.0, + "step": 432 + }, + { + "epoch": 0.15422974176313445, + "grad_norm": 0.6902212500572205, + "learning_rate": 7.686832740213523e-07, + "loss": 0.8293, + "mean_token_accuracy": 0.7435656487941742, + "num_tokens": 124722482.0, + "step": 433 + }, + { + "epoch": 0.1545859305431879, + "grad_norm": 0.713404655456543, + "learning_rate": 7.704626334519572e-07, + "loss": 0.8599, + "mean_token_accuracy": 0.7394591867923737, + "num_tokens": 125010956.0, + "step": 434 + }, + { + "epoch": 0.1549421193232413, + "grad_norm": 0.7230011224746704, + "learning_rate": 7.722419928825622e-07, + "loss": 0.8532, + "mean_token_accuracy": 0.7412800937891006, + "num_tokens": 125324107.0, + "step": 435 + }, + { + "epoch": 0.15529830810329476, + "grad_norm": 0.73555588722229, + "learning_rate": 7.740213523131672e-07, + "loss": 0.8546, + "mean_token_accuracy": 0.7390526980161667, + "num_tokens": 125605197.0, + "step": 436 + }, + { + "epoch": 0.15565449688334818, + "grad_norm": 
0.6673810482025146, + "learning_rate": 7.758007117437722e-07, + "loss": 0.8327, + "mean_token_accuracy": 0.7448458671569824, + "num_tokens": 125929074.0, + "step": 437 + }, + { + "epoch": 0.1560106856634016, + "grad_norm": 0.6891106963157654, + "learning_rate": 7.775800711743772e-07, + "loss": 0.8305, + "mean_token_accuracy": 0.7460180371999741, + "num_tokens": 126209880.0, + "step": 438 + }, + { + "epoch": 0.15636687444345504, + "grad_norm": 0.7216120958328247, + "learning_rate": 7.793594306049822e-07, + "loss": 0.7592, + "mean_token_accuracy": 0.7606547623872757, + "num_tokens": 126503810.0, + "step": 439 + }, + { + "epoch": 0.15672306322350846, + "grad_norm": 0.6703522801399231, + "learning_rate": 7.811387900355872e-07, + "loss": 0.7971, + "mean_token_accuracy": 0.7514527440071106, + "num_tokens": 126806452.0, + "step": 440 + }, + { + "epoch": 0.15707925200356188, + "grad_norm": 0.7384240031242371, + "learning_rate": 7.829181494661921e-07, + "loss": 0.7413, + "mean_token_accuracy": 0.7652167975902557, + "num_tokens": 127085224.0, + "step": 441 + }, + { + "epoch": 0.15743544078361532, + "grad_norm": 0.7478107213973999, + "learning_rate": 7.846975088967971e-07, + "loss": 0.8646, + "mean_token_accuracy": 0.735996276140213, + "num_tokens": 127365269.0, + "step": 442 + }, + { + "epoch": 0.15779162956366874, + "grad_norm": 0.6904101371765137, + "learning_rate": 7.864768683274021e-07, + "loss": 0.8282, + "mean_token_accuracy": 0.7495797574520111, + "num_tokens": 127674139.0, + "step": 443 + }, + { + "epoch": 0.15814781834372219, + "grad_norm": 0.738361656665802, + "learning_rate": 7.88256227758007e-07, + "loss": 0.8504, + "mean_token_accuracy": 0.7432016581296921, + "num_tokens": 127937445.0, + "step": 444 + }, + { + "epoch": 0.1585040071237756, + "grad_norm": 0.6644251346588135, + "learning_rate": 7.900355871886121e-07, + "loss": 0.8329, + "mean_token_accuracy": 0.7405498772859573, + "num_tokens": 128227425.0, + "step": 445 + }, + { + "epoch": 0.15886019590382902, + 
"grad_norm": 0.7221015691757202, + "learning_rate": 7.91814946619217e-07, + "loss": 0.8092, + "mean_token_accuracy": 0.7509548515081406, + "num_tokens": 128489440.0, + "step": 446 + }, + { + "epoch": 0.15921638468388247, + "grad_norm": 0.6987266540527344, + "learning_rate": 7.935943060498221e-07, + "loss": 0.7945, + "mean_token_accuracy": 0.7563659250736237, + "num_tokens": 128770126.0, + "step": 447 + }, + { + "epoch": 0.15957257346393589, + "grad_norm": 0.7285388708114624, + "learning_rate": 7.95373665480427e-07, + "loss": 0.7373, + "mean_token_accuracy": 0.7693547606468201, + "num_tokens": 129055664.0, + "step": 448 + }, + { + "epoch": 0.1599287622439893, + "grad_norm": 0.7699159383773804, + "learning_rate": 7.97153024911032e-07, + "loss": 0.8013, + "mean_token_accuracy": 0.7567782551050186, + "num_tokens": 129289469.0, + "step": 449 + }, + { + "epoch": 0.16028495102404275, + "grad_norm": 0.744652509689331, + "learning_rate": 7.98932384341637e-07, + "loss": 0.878, + "mean_token_accuracy": 0.7376460582017899, + "num_tokens": 129554018.0, + "step": 450 + }, + { + "epoch": 0.16064113980409617, + "grad_norm": 0.7665709853172302, + "learning_rate": 8.007117437722419e-07, + "loss": 0.8355, + "mean_token_accuracy": 0.740072712302208, + "num_tokens": 129803205.0, + "step": 451 + }, + { + "epoch": 0.1609973285841496, + "grad_norm": 0.7514992952346802, + "learning_rate": 8.02491103202847e-07, + "loss": 0.8907, + "mean_token_accuracy": 0.7270955443382263, + "num_tokens": 130042041.0, + "step": 452 + }, + { + "epoch": 0.16135351736420303, + "grad_norm": 0.6889475584030151, + "learning_rate": 8.042704626334519e-07, + "loss": 0.8863, + "mean_token_accuracy": 0.735449954867363, + "num_tokens": 130347732.0, + "step": 453 + }, + { + "epoch": 0.16170970614425645, + "grad_norm": 0.7111026048660278, + "learning_rate": 8.06049822064057e-07, + "loss": 0.8799, + "mean_token_accuracy": 0.7325556725263596, + "num_tokens": 130630895.0, + "step": 454 + }, + { + "epoch": 
0.1620658949243099, + "grad_norm": 0.7269096970558167, + "learning_rate": 8.078291814946618e-07, + "loss": 0.852, + "mean_token_accuracy": 0.7404086738824844, + "num_tokens": 130931556.0, + "step": 455 + }, + { + "epoch": 0.1624220837043633, + "grad_norm": 0.6558296084403992, + "learning_rate": 8.096085409252668e-07, + "loss": 0.8042, + "mean_token_accuracy": 0.7502182871103287, + "num_tokens": 131251734.0, + "step": 456 + }, + { + "epoch": 0.16277827248441673, + "grad_norm": 0.6479088068008423, + "learning_rate": 8.113879003558719e-07, + "loss": 0.8374, + "mean_token_accuracy": 0.7477431893348694, + "num_tokens": 131557393.0, + "step": 457 + }, + { + "epoch": 0.16313446126447018, + "grad_norm": 0.7851834893226624, + "learning_rate": 8.131672597864768e-07, + "loss": 0.8109, + "mean_token_accuracy": 0.7504305988550186, + "num_tokens": 131821738.0, + "step": 458 + }, + { + "epoch": 0.1634906500445236, + "grad_norm": 0.7029165625572205, + "learning_rate": 8.149466192170819e-07, + "loss": 0.8206, + "mean_token_accuracy": 0.7477936893701553, + "num_tokens": 132111905.0, + "step": 459 + }, + { + "epoch": 0.163846838824577, + "grad_norm": 0.7634729743003845, + "learning_rate": 8.167259786476868e-07, + "loss": 0.8279, + "mean_token_accuracy": 0.7510463446378708, + "num_tokens": 132365463.0, + "step": 460 + }, + { + "epoch": 0.16420302760463046, + "grad_norm": 0.7015594244003296, + "learning_rate": 8.185053380782919e-07, + "loss": 0.8511, + "mean_token_accuracy": 0.73646479845047, + "num_tokens": 132659478.0, + "step": 461 + }, + { + "epoch": 0.16455921638468388, + "grad_norm": 0.7920646071434021, + "learning_rate": 8.202846975088967e-07, + "loss": 0.8829, + "mean_token_accuracy": 0.72760309278965, + "num_tokens": 132935862.0, + "step": 462 + }, + { + "epoch": 0.16491540516473732, + "grad_norm": 0.8208740949630737, + "learning_rate": 8.220640569395017e-07, + "loss": 0.8385, + "mean_token_accuracy": 0.7369623184204102, + "num_tokens": 133163048.0, + "step": 463 + }, + { + 
"epoch": 0.16527159394479074, + "grad_norm": 0.6822824478149414, + "learning_rate": 8.238434163701067e-07, + "loss": 0.823, + "mean_token_accuracy": 0.744083046913147, + "num_tokens": 133460216.0, + "step": 464 + }, + { + "epoch": 0.16562778272484416, + "grad_norm": 0.7610927820205688, + "learning_rate": 8.256227758007117e-07, + "loss": 0.8359, + "mean_token_accuracy": 0.7494080066680908, + "num_tokens": 133756143.0, + "step": 465 + }, + { + "epoch": 0.1659839715048976, + "grad_norm": 0.6783084869384766, + "learning_rate": 8.274021352313167e-07, + "loss": 0.8103, + "mean_token_accuracy": 0.7523167431354523, + "num_tokens": 134052679.0, + "step": 466 + }, + { + "epoch": 0.16634016028495102, + "grad_norm": 0.716691255569458, + "learning_rate": 8.291814946619217e-07, + "loss": 0.7878, + "mean_token_accuracy": 0.7542453855276108, + "num_tokens": 134325179.0, + "step": 467 + }, + { + "epoch": 0.16669634906500444, + "grad_norm": 0.7494585514068604, + "learning_rate": 8.309608540925266e-07, + "loss": 0.7701, + "mean_token_accuracy": 0.7569205462932587, + "num_tokens": 134580163.0, + "step": 468 + }, + { + "epoch": 0.16705253784505789, + "grad_norm": 0.7485362887382507, + "learning_rate": 8.327402135231316e-07, + "loss": 0.9641, + "mean_token_accuracy": 0.7121927440166473, + "num_tokens": 134877639.0, + "step": 469 + }, + { + "epoch": 0.1674087266251113, + "grad_norm": 0.6968991756439209, + "learning_rate": 8.345195729537366e-07, + "loss": 0.9161, + "mean_token_accuracy": 0.7249750345945358, + "num_tokens": 135189594.0, + "step": 470 + }, + { + "epoch": 0.16776491540516475, + "grad_norm": 0.751426637172699, + "learning_rate": 8.362989323843416e-07, + "loss": 0.8567, + "mean_token_accuracy": 0.7444113790988922, + "num_tokens": 135474813.0, + "step": 471 + }, + { + "epoch": 0.16812110418521817, + "grad_norm": 0.7233130931854248, + "learning_rate": 8.380782918149466e-07, + "loss": 0.9173, + "mean_token_accuracy": 0.7257480621337891, + "num_tokens": 135762082.0, + "step": 472 
+ }, + { + "epoch": 0.16847729296527159, + "grad_norm": 0.6874028444290161, + "learning_rate": 8.398576512455516e-07, + "loss": 0.7452, + "mean_token_accuracy": 0.7691357284784317, + "num_tokens": 136058823.0, + "step": 473 + }, + { + "epoch": 0.16883348174532503, + "grad_norm": 0.6791144609451294, + "learning_rate": 8.416370106761566e-07, + "loss": 0.723, + "mean_token_accuracy": 0.7705571055412292, + "num_tokens": 136366250.0, + "step": 474 + }, + { + "epoch": 0.16918967052537845, + "grad_norm": 0.8315048813819885, + "learning_rate": 8.434163701067614e-07, + "loss": 0.8722, + "mean_token_accuracy": 0.7310643494129181, + "num_tokens": 136584212.0, + "step": 475 + }, + { + "epoch": 0.16954585930543187, + "grad_norm": 0.7128815650939941, + "learning_rate": 8.451957295373665e-07, + "loss": 0.8615, + "mean_token_accuracy": 0.7367059141397476, + "num_tokens": 136863721.0, + "step": 476 + }, + { + "epoch": 0.1699020480854853, + "grad_norm": 0.6683933138847351, + "learning_rate": 8.469750889679715e-07, + "loss": 0.8792, + "mean_token_accuracy": 0.734981894493103, + "num_tokens": 137179369.0, + "step": 477 + }, + { + "epoch": 0.17025823686553873, + "grad_norm": 0.7099683284759521, + "learning_rate": 8.487544483985765e-07, + "loss": 0.8356, + "mean_token_accuracy": 0.7456011921167374, + "num_tokens": 137453153.0, + "step": 478 + }, + { + "epoch": 0.17061442564559218, + "grad_norm": 0.6877455711364746, + "learning_rate": 8.505338078291815e-07, + "loss": 0.8819, + "mean_token_accuracy": 0.7345184832811356, + "num_tokens": 137761236.0, + "step": 479 + }, + { + "epoch": 0.1709706144256456, + "grad_norm": 0.7454991340637207, + "learning_rate": 8.523131672597864e-07, + "loss": 0.8347, + "mean_token_accuracy": 0.7419236749410629, + "num_tokens": 138057296.0, + "step": 480 + }, + { + "epoch": 0.171326803205699, + "grad_norm": 0.6910465359687805, + "learning_rate": 8.540925266903915e-07, + "loss": 0.8788, + "mean_token_accuracy": 0.733534187078476, + "num_tokens": 138364062.0, + 
"step": 481 + }, + { + "epoch": 0.17168299198575246, + "grad_norm": 0.7146512866020203, + "learning_rate": 8.558718861209963e-07, + "loss": 0.8016, + "mean_token_accuracy": 0.7543796300888062, + "num_tokens": 138647689.0, + "step": 482 + }, + { + "epoch": 0.17203918076580588, + "grad_norm": 0.7160322070121765, + "learning_rate": 8.576512455516014e-07, + "loss": 0.803, + "mean_token_accuracy": 0.748015895485878, + "num_tokens": 138934306.0, + "step": 483 + }, + { + "epoch": 0.1723953695458593, + "grad_norm": 0.6663281321525574, + "learning_rate": 8.594306049822063e-07, + "loss": 0.7923, + "mean_token_accuracy": 0.7592195272445679, + "num_tokens": 139239617.0, + "step": 484 + }, + { + "epoch": 0.17275155832591274, + "grad_norm": 0.7535040974617004, + "learning_rate": 8.612099644128114e-07, + "loss": 0.8495, + "mean_token_accuracy": 0.7355668693780899, + "num_tokens": 139498119.0, + "step": 485 + }, + { + "epoch": 0.17310774710596616, + "grad_norm": 0.7289955019950867, + "learning_rate": 8.629893238434164e-07, + "loss": 0.927, + "mean_token_accuracy": 0.7195906639099121, + "num_tokens": 139779697.0, + "step": 486 + }, + { + "epoch": 0.17346393588601958, + "grad_norm": 0.6869931221008301, + "learning_rate": 8.647686832740213e-07, + "loss": 0.8097, + "mean_token_accuracy": 0.7518081665039062, + "num_tokens": 140070813.0, + "step": 487 + }, + { + "epoch": 0.17382012466607302, + "grad_norm": 0.7524558901786804, + "learning_rate": 8.665480427046264e-07, + "loss": 0.8395, + "mean_token_accuracy": 0.7411208152770996, + "num_tokens": 140332553.0, + "step": 488 + }, + { + "epoch": 0.17417631344612644, + "grad_norm": 0.7287404537200928, + "learning_rate": 8.683274021352312e-07, + "loss": 0.8787, + "mean_token_accuracy": 0.7352984547615051, + "num_tokens": 140620568.0, + "step": 489 + }, + { + "epoch": 0.1745325022261799, + "grad_norm": 0.730720579624176, + "learning_rate": 8.701067615658363e-07, + "loss": 0.7818, + "mean_token_accuracy": 0.7522129714488983, + "num_tokens": 
140890399.0, + "step": 490 + }, + { + "epoch": 0.1748886910062333, + "grad_norm": 0.7387116551399231, + "learning_rate": 8.718861209964412e-07, + "loss": 0.9128, + "mean_token_accuracy": 0.7219896167516708, + "num_tokens": 141170222.0, + "step": 491 + }, + { + "epoch": 0.17524487978628672, + "grad_norm": 0.704042911529541, + "learning_rate": 8.736654804270462e-07, + "loss": 0.8752, + "mean_token_accuracy": 0.7298970818519592, + "num_tokens": 141471261.0, + "step": 492 + }, + { + "epoch": 0.17560106856634017, + "grad_norm": 0.6757092475891113, + "learning_rate": 8.754448398576512e-07, + "loss": 0.8646, + "mean_token_accuracy": 0.7365240156650543, + "num_tokens": 141780492.0, + "step": 493 + }, + { + "epoch": 0.1759572573463936, + "grad_norm": 0.7024980783462524, + "learning_rate": 8.772241992882562e-07, + "loss": 0.8411, + "mean_token_accuracy": 0.7449780255556107, + "num_tokens": 142078358.0, + "step": 494 + }, + { + "epoch": 0.176313446126447, + "grad_norm": 0.7204870581626892, + "learning_rate": 8.790035587188612e-07, + "loss": 0.8304, + "mean_token_accuracy": 0.7435835748910904, + "num_tokens": 142364256.0, + "step": 495 + }, + { + "epoch": 0.17666963490650045, + "grad_norm": 0.7250639796257019, + "learning_rate": 8.807829181494661e-07, + "loss": 0.8481, + "mean_token_accuracy": 0.7360876649618149, + "num_tokens": 142646241.0, + "step": 496 + }, + { + "epoch": 0.17702582368655387, + "grad_norm": 0.6749024391174316, + "learning_rate": 8.825622775800712e-07, + "loss": 0.7798, + "mean_token_accuracy": 0.7570681124925613, + "num_tokens": 142950902.0, + "step": 497 + }, + { + "epoch": 0.17738201246660731, + "grad_norm": 0.7646618485450745, + "learning_rate": 8.843416370106761e-07, + "loss": 0.8489, + "mean_token_accuracy": 0.7393633872270584, + "num_tokens": 143230707.0, + "step": 498 + }, + { + "epoch": 0.17773820124666073, + "grad_norm": 0.7230663895606995, + "learning_rate": 8.861209964412811e-07, + "loss": 0.7791, + "mean_token_accuracy": 0.7579367756843567, + 
"num_tokens": 143518624.0, + "step": 499 + }, + { + "epoch": 0.17809439002671415, + "grad_norm": 0.7295573353767395, + "learning_rate": 8.879003558718861e-07, + "loss": 0.9082, + "mean_token_accuracy": 0.7308065742254257, + "num_tokens": 143807512.0, + "step": 500 + }, + { + "epoch": 0.1784505788067676, + "grad_norm": 0.7258682250976562, + "learning_rate": 8.896797153024911e-07, + "loss": 0.7737, + "mean_token_accuracy": 0.7561025172472, + "num_tokens": 144107441.0, + "step": 501 + }, + { + "epoch": 0.178806767586821, + "grad_norm": 0.7349066138267517, + "learning_rate": 8.91459074733096e-07, + "loss": 0.8106, + "mean_token_accuracy": 0.7555597424507141, + "num_tokens": 144374216.0, + "step": 502 + }, + { + "epoch": 0.17916295636687443, + "grad_norm": 0.6702461242675781, + "learning_rate": 8.93238434163701e-07, + "loss": 0.7814, + "mean_token_accuracy": 0.7560131996870041, + "num_tokens": 144671189.0, + "step": 503 + }, + { + "epoch": 0.17951914514692788, + "grad_norm": 0.6994695663452148, + "learning_rate": 8.950177935943059e-07, + "loss": 0.7961, + "mean_token_accuracy": 0.752513125538826, + "num_tokens": 144960105.0, + "step": 504 + }, + { + "epoch": 0.1798753339269813, + "grad_norm": 0.6604408025741577, + "learning_rate": 8.96797153024911e-07, + "loss": 0.8648, + "mean_token_accuracy": 0.7343113571405411, + "num_tokens": 145304365.0, + "step": 505 + }, + { + "epoch": 0.18023152270703474, + "grad_norm": 0.690041184425354, + "learning_rate": 8.98576512455516e-07, + "loss": 0.8151, + "mean_token_accuracy": 0.7458937615156174, + "num_tokens": 145611422.0, + "step": 506 + }, + { + "epoch": 0.18058771148708816, + "grad_norm": 0.695729672908783, + "learning_rate": 9.00355871886121e-07, + "loss": 0.819, + "mean_token_accuracy": 0.7473014742136002, + "num_tokens": 145904718.0, + "step": 507 + }, + { + "epoch": 0.18094390026714158, + "grad_norm": 0.6957570910453796, + "learning_rate": 9.02135231316726e-07, + "loss": 0.7596, + "mean_token_accuracy": 0.763221949338913, + 
"num_tokens": 146211896.0, + "step": 508 + }, + { + "epoch": 0.18130008904719502, + "grad_norm": 0.7394476532936096, + "learning_rate": 9.03914590747331e-07, + "loss": 0.8443, + "mean_token_accuracy": 0.7427958250045776, + "num_tokens": 146495661.0, + "step": 509 + }, + { + "epoch": 0.18165627782724844, + "grad_norm": 0.6860657334327698, + "learning_rate": 9.056939501779359e-07, + "loss": 0.899, + "mean_token_accuracy": 0.7314193546772003, + "num_tokens": 146781515.0, + "step": 510 + }, + { + "epoch": 0.18201246660730186, + "grad_norm": 0.7697842717170715, + "learning_rate": 9.074733096085408e-07, + "loss": 0.8672, + "mean_token_accuracy": 0.7350073754787445, + "num_tokens": 147054125.0, + "step": 511 + }, + { + "epoch": 0.1823686553873553, + "grad_norm": 0.7585417032241821, + "learning_rate": 9.092526690391459e-07, + "loss": 0.8859, + "mean_token_accuracy": 0.7348307520151138, + "num_tokens": 147296894.0, + "step": 512 + }, + { + "epoch": 0.18272484416740872, + "grad_norm": 0.703740656375885, + "learning_rate": 9.110320284697508e-07, + "loss": 0.8294, + "mean_token_accuracy": 0.7436494678258896, + "num_tokens": 147578162.0, + "step": 513 + }, + { + "epoch": 0.18308103294746214, + "grad_norm": 0.6632955074310303, + "learning_rate": 9.128113879003559e-07, + "loss": 0.7529, + "mean_token_accuracy": 0.7655348479747772, + "num_tokens": 147872186.0, + "step": 514 + }, + { + "epoch": 0.1834372217275156, + "grad_norm": 0.6475892066955566, + "learning_rate": 9.145907473309609e-07, + "loss": 0.7657, + "mean_token_accuracy": 0.7618273943662643, + "num_tokens": 148201078.0, + "step": 515 + }, + { + "epoch": 0.183793410507569, + "grad_norm": 0.7517141699790955, + "learning_rate": 9.163701067615657e-07, + "loss": 0.8302, + "mean_token_accuracy": 0.7425542920827866, + "num_tokens": 148474526.0, + "step": 516 + }, + { + "epoch": 0.18414959928762245, + "grad_norm": 0.7266284823417664, + "learning_rate": 9.181494661921708e-07, + "loss": 0.7978, + "mean_token_accuracy": 
0.7539047449827194, + "num_tokens": 148757944.0, + "step": 517 + }, + { + "epoch": 0.18450578806767587, + "grad_norm": 0.7007148265838623, + "learning_rate": 9.199288256227757e-07, + "loss": 0.8147, + "mean_token_accuracy": 0.7472696304321289, + "num_tokens": 149036407.0, + "step": 518 + }, + { + "epoch": 0.1848619768477293, + "grad_norm": 0.6903685927391052, + "learning_rate": 9.217081850533808e-07, + "loss": 0.8463, + "mean_token_accuracy": 0.7345957905054092, + "num_tokens": 149348067.0, + "step": 519 + }, + { + "epoch": 0.18521816562778273, + "grad_norm": 0.7007926106452942, + "learning_rate": 9.234875444839857e-07, + "loss": 0.8201, + "mean_token_accuracy": 0.7483717799186707, + "num_tokens": 149638089.0, + "step": 520 + }, + { + "epoch": 0.18557435440783615, + "grad_norm": 0.6817996501922607, + "learning_rate": 9.252669039145908e-07, + "loss": 0.7819, + "mean_token_accuracy": 0.7609364837408066, + "num_tokens": 149932825.0, + "step": 521 + }, + { + "epoch": 0.18593054318788957, + "grad_norm": 0.7501301169395447, + "learning_rate": 9.270462633451957e-07, + "loss": 0.7952, + "mean_token_accuracy": 0.7545637488365173, + "num_tokens": 150182676.0, + "step": 522 + }, + { + "epoch": 0.18628673196794301, + "grad_norm": 0.7006324529647827, + "learning_rate": 9.288256227758006e-07, + "loss": 0.7927, + "mean_token_accuracy": 0.7571704536676407, + "num_tokens": 150463545.0, + "step": 523 + }, + { + "epoch": 0.18664292074799643, + "grad_norm": 0.7028580904006958, + "learning_rate": 9.306049822064056e-07, + "loss": 0.8649, + "mean_token_accuracy": 0.7339386194944382, + "num_tokens": 150754353.0, + "step": 524 + }, + { + "epoch": 0.18699910952804988, + "grad_norm": 0.7568219304084778, + "learning_rate": 9.323843416370106e-07, + "loss": 0.8164, + "mean_token_accuracy": 0.7493298202753067, + "num_tokens": 151001961.0, + "step": 525 + }, + { + "epoch": 0.1873552983081033, + "grad_norm": 0.6515777111053467, + "learning_rate": 9.341637010676157e-07, + "loss": 0.7661, + 
"mean_token_accuracy": 0.7643761783838272, + "num_tokens": 151315761.0, + "step": 526 + }, + { + "epoch": 0.18771148708815671, + "grad_norm": 0.6830393671989441, + "learning_rate": 9.359430604982206e-07, + "loss": 0.7805, + "mean_token_accuracy": 0.7544074654579163, + "num_tokens": 151606158.0, + "step": 527 + }, + { + "epoch": 0.18806767586821016, + "grad_norm": 0.6813042759895325, + "learning_rate": 9.377224199288256e-07, + "loss": 0.8649, + "mean_token_accuracy": 0.7405010908842087, + "num_tokens": 151898585.0, + "step": 528 + }, + { + "epoch": 0.18842386464826358, + "grad_norm": 0.7003040313720703, + "learning_rate": 9.395017793594306e-07, + "loss": 0.844, + "mean_token_accuracy": 0.7333419770002365, + "num_tokens": 152176354.0, + "step": 529 + }, + { + "epoch": 0.188780053428317, + "grad_norm": 0.6409923434257507, + "learning_rate": 9.412811387900355e-07, + "loss": 0.7801, + "mean_token_accuracy": 0.755828246474266, + "num_tokens": 152508380.0, + "step": 530 + }, + { + "epoch": 0.18913624220837044, + "grad_norm": 0.6951688528060913, + "learning_rate": 9.430604982206405e-07, + "loss": 0.7981, + "mean_token_accuracy": 0.748407632112503, + "num_tokens": 152787596.0, + "step": 531 + }, + { + "epoch": 0.18949243098842386, + "grad_norm": 0.6717240214347839, + "learning_rate": 9.448398576512455e-07, + "loss": 0.8045, + "mean_token_accuracy": 0.7502069175243378, + "num_tokens": 153092997.0, + "step": 532 + }, + { + "epoch": 0.1898486197684773, + "grad_norm": 0.7237720489501953, + "learning_rate": 9.466192170818504e-07, + "loss": 0.8786, + "mean_token_accuracy": 0.7316673845052719, + "num_tokens": 153370190.0, + "step": 533 + }, + { + "epoch": 0.19020480854853072, + "grad_norm": 0.715050995349884, + "learning_rate": 9.483985765124555e-07, + "loss": 0.8002, + "mean_token_accuracy": 0.7487118542194366, + "num_tokens": 153665749.0, + "step": 534 + }, + { + "epoch": 0.19056099732858414, + "grad_norm": 0.7403811812400818, + "learning_rate": 9.501779359430605e-07, + "loss": 
0.8192, + "mean_token_accuracy": 0.7437000274658203, + "num_tokens": 153943967.0, + "step": 535 + }, + { + "epoch": 0.1909171861086376, + "grad_norm": 0.6908077597618103, + "learning_rate": 9.519572953736655e-07, + "loss": 0.8577, + "mean_token_accuracy": 0.7384116947650909, + "num_tokens": 154252376.0, + "step": 536 + }, + { + "epoch": 0.191273374888691, + "grad_norm": 0.7068831920623779, + "learning_rate": 9.537366548042705e-07, + "loss": 0.834, + "mean_token_accuracy": 0.7410516440868378, + "num_tokens": 154529093.0, + "step": 537 + }, + { + "epoch": 0.19162956366874442, + "grad_norm": 0.6765683889389038, + "learning_rate": 9.555160142348753e-07, + "loss": 0.7582, + "mean_token_accuracy": 0.7646732032299042, + "num_tokens": 154825205.0, + "step": 538 + }, + { + "epoch": 0.19198575244879787, + "grad_norm": 0.6890894174575806, + "learning_rate": 9.572953736654805e-07, + "loss": 0.8504, + "mean_token_accuracy": 0.7421608567237854, + "num_tokens": 155103323.0, + "step": 539 + }, + { + "epoch": 0.1923419412288513, + "grad_norm": 0.679840624332428, + "learning_rate": 9.590747330960853e-07, + "loss": 0.8936, + "mean_token_accuracy": 0.7293635457754135, + "num_tokens": 155407943.0, + "step": 540 + }, + { + "epoch": 0.19269813000890473, + "grad_norm": 0.7742056846618652, + "learning_rate": 9.608540925266903e-07, + "loss": 0.8389, + "mean_token_accuracy": 0.7434422224760056, + "num_tokens": 155665300.0, + "step": 541 + }, + { + "epoch": 0.19305431878895815, + "grad_norm": 0.6904928684234619, + "learning_rate": 9.626334519572953e-07, + "loss": 0.7853, + "mean_token_accuracy": 0.7561953961849213, + "num_tokens": 155966757.0, + "step": 542 + }, + { + "epoch": 0.19341050756901157, + "grad_norm": 0.6958359479904175, + "learning_rate": 9.644128113879002e-07, + "loss": 0.9098, + "mean_token_accuracy": 0.7275102734565735, + "num_tokens": 156262134.0, + "step": 543 + }, + { + "epoch": 0.19376669634906502, + "grad_norm": 0.7123000621795654, + "learning_rate": 9.661921708185054e-07, 
+ "loss": 0.9062, + "mean_token_accuracy": 0.7217912077903748, + "num_tokens": 156539012.0, + "step": 544 + }, + { + "epoch": 0.19412288512911843, + "grad_norm": 0.694889485836029, + "learning_rate": 9.679715302491102e-07, + "loss": 0.8412, + "mean_token_accuracy": 0.7453763633966446, + "num_tokens": 156823014.0, + "step": 545 + }, + { + "epoch": 0.19447907390917185, + "grad_norm": 0.7656348347663879, + "learning_rate": 9.697508896797152e-07, + "loss": 0.8654, + "mean_token_accuracy": 0.7394191920757294, + "num_tokens": 157088149.0, + "step": 546 + }, + { + "epoch": 0.1948352626892253, + "grad_norm": 0.7364482283592224, + "learning_rate": 9.715302491103202e-07, + "loss": 0.8542, + "mean_token_accuracy": 0.7390437871217728, + "num_tokens": 157358893.0, + "step": 547 + }, + { + "epoch": 0.19519145146927872, + "grad_norm": 0.6720536947250366, + "learning_rate": 9.733096085409252e-07, + "loss": 0.814, + "mean_token_accuracy": 0.7459864169359207, + "num_tokens": 157665762.0, + "step": 548 + }, + { + "epoch": 0.19554764024933213, + "grad_norm": 0.6932239532470703, + "learning_rate": 9.750889679715302e-07, + "loss": 0.8075, + "mean_token_accuracy": 0.7510308176279068, + "num_tokens": 157959721.0, + "step": 549 + }, + { + "epoch": 0.19590382902938558, + "grad_norm": 0.7044507265090942, + "learning_rate": 9.768683274021351e-07, + "loss": 0.8275, + "mean_token_accuracy": 0.7380219101905823, + "num_tokens": 158236880.0, + "step": 550 + }, + { + "epoch": 0.196260017809439, + "grad_norm": 0.666174054145813, + "learning_rate": 9.786476868327401e-07, + "loss": 0.7148, + "mean_token_accuracy": 0.7749602049589157, + "num_tokens": 158564596.0, + "step": 551 + }, + { + "epoch": 0.19661620658949244, + "grad_norm": 0.6891554594039917, + "learning_rate": 9.804270462633451e-07, + "loss": 0.7874, + "mean_token_accuracy": 0.755044087767601, + "num_tokens": 158877828.0, + "step": 552 + }, + { + "epoch": 0.19697239536954586, + "grad_norm": 0.7785463333129883, + "learning_rate": 
9.8220640569395e-07, + "loss": 0.8294, + "mean_token_accuracy": 0.7412566095590591, + "num_tokens": 159128047.0, + "step": 553 + }, + { + "epoch": 0.19732858414959928, + "grad_norm": 0.7236807346343994, + "learning_rate": 9.83985765124555e-07, + "loss": 0.8422, + "mean_token_accuracy": 0.7415138930082321, + "num_tokens": 159397630.0, + "step": 554 + }, + { + "epoch": 0.19768477292965272, + "grad_norm": 0.7512716054916382, + "learning_rate": 9.8576512455516e-07, + "loss": 0.8683, + "mean_token_accuracy": 0.7358522266149521, + "num_tokens": 159693525.0, + "step": 555 + }, + { + "epoch": 0.19804096170970614, + "grad_norm": 0.7258298993110657, + "learning_rate": 9.87544483985765e-07, + "loss": 0.8792, + "mean_token_accuracy": 0.7315467149019241, + "num_tokens": 159989319.0, + "step": 556 + }, + { + "epoch": 0.19839715048975956, + "grad_norm": 0.7171956300735474, + "learning_rate": 9.8932384341637e-07, + "loss": 0.8345, + "mean_token_accuracy": 0.7473087757825851, + "num_tokens": 160292688.0, + "step": 557 + }, + { + "epoch": 0.198753339269813, + "grad_norm": 0.6864299178123474, + "learning_rate": 9.91103202846975e-07, + "loss": 0.8017, + "mean_token_accuracy": 0.751194640994072, + "num_tokens": 160587055.0, + "step": 558 + }, + { + "epoch": 0.19910952804986642, + "grad_norm": 0.6579440832138062, + "learning_rate": 9.9288256227758e-07, + "loss": 0.8224, + "mean_token_accuracy": 0.7406383156776428, + "num_tokens": 160912558.0, + "step": 559 + }, + { + "epoch": 0.19946571682991987, + "grad_norm": 0.6899605393409729, + "learning_rate": 9.94661921708185e-07, + "loss": 0.8774, + "mean_token_accuracy": 0.7320640087127686, + "num_tokens": 161222970.0, + "step": 560 + }, + { + "epoch": 0.1998219056099733, + "grad_norm": 0.7040217518806458, + "learning_rate": 9.9644128113879e-07, + "loss": 0.9209, + "mean_token_accuracy": 0.7185641229152679, + "num_tokens": 161509485.0, + "step": 561 + }, + { + "epoch": 0.2001780943900267, + "grad_norm": 0.7186869382858276, + "learning_rate": 
9.98220640569395e-07, + "loss": 0.7751, + "mean_token_accuracy": 0.7478864341974258, + "num_tokens": 161772306.0, + "step": 562 + }, + { + "epoch": 0.20053428317008015, + "grad_norm": 0.7288325428962708, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7314960956573486, + "num_tokens": 162050545.0, + "step": 563 + }, + { + "epoch": 0.20089047195013357, + "grad_norm": 0.6811203360557556, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7475832253694534, + "num_tokens": 162386511.0, + "step": 564 + }, + { + "epoch": 0.201246660730187, + "grad_norm": 0.7107973098754883, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7479148805141449, + "num_tokens": 162665316.0, + "step": 565 + }, + { + "epoch": 0.20160284951024043, + "grad_norm": 0.7047806978225708, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7486565709114075, + "num_tokens": 162964474.0, + "step": 566 + }, + { + "epoch": 0.20195903829029385, + "grad_norm": 0.6627113819122314, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.757062554359436, + "num_tokens": 163278066.0, + "step": 567 + }, + { + "epoch": 0.2023152270703473, + "grad_norm": 0.6983655095100403, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7399780303239822, + "num_tokens": 163594072.0, + "step": 568 + }, + { + "epoch": 0.20267141585040072, + "grad_norm": 0.7459298968315125, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7434957027435303, + "num_tokens": 163846751.0, + "step": 569 + }, + { + "epoch": 0.20302760463045413, + "grad_norm": 0.76077800989151, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7405428439378738, + "num_tokens": 164153228.0, + "step": 570 + }, + { + "epoch": 0.20338379341050758, + "grad_norm": 0.6868106722831726, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7482362687587738, + "num_tokens": 164443092.0, + "step": 571 + }, + { 
+ "epoch": 0.203739982190561, + "grad_norm": 0.7055049538612366, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7445182353258133, + "num_tokens": 164727631.0, + "step": 572 + }, + { + "epoch": 0.20409617097061442, + "grad_norm": 0.7709288001060486, + "learning_rate": 1e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7562787234783173, + "num_tokens": 165013963.0, + "step": 573 + }, + { + "epoch": 0.20445235975066786, + "grad_norm": 0.6959009766578674, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7404976040124893, + "num_tokens": 165333713.0, + "step": 574 + }, + { + "epoch": 0.20480854853072128, + "grad_norm": 0.7296468615531921, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7478473484516144, + "num_tokens": 165595897.0, + "step": 575 + }, + { + "epoch": 0.2051647373107747, + "grad_norm": 0.7070809006690979, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7576356083154678, + "num_tokens": 165873131.0, + "step": 576 + }, + { + "epoch": 0.20552092609082814, + "grad_norm": 0.7648537158966064, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7468665838241577, + "num_tokens": 166173359.0, + "step": 577 + }, + { + "epoch": 0.20587711487088156, + "grad_norm": 0.7052445411682129, + "learning_rate": 1e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7666405737400055, + "num_tokens": 166455195.0, + "step": 578 + }, + { + "epoch": 0.206233303650935, + "grad_norm": 0.7571142315864563, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.741596132516861, + "num_tokens": 166713948.0, + "step": 579 + }, + { + "epoch": 0.20658949243098843, + "grad_norm": 0.7309553027153015, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7494176179170609, + "num_tokens": 166992068.0, + "step": 580 + }, + { + "epoch": 0.20694568121104184, + "grad_norm": 0.7132804989814758, + "learning_rate": 1e-06, + "loss": 0.8462, + 
"mean_token_accuracy": 0.7377505749464035, + "num_tokens": 167303141.0, + "step": 581 + }, + { + "epoch": 0.2073018699910953, + "grad_norm": 0.6397101879119873, + "learning_rate": 1e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.751934215426445, + "num_tokens": 167631786.0, + "step": 582 + }, + { + "epoch": 0.2076580587711487, + "grad_norm": 0.7800129055976868, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7442528158426285, + "num_tokens": 167890855.0, + "step": 583 + }, + { + "epoch": 0.20801424755120212, + "grad_norm": 0.7153105735778809, + "learning_rate": 1e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7657853066921234, + "num_tokens": 168158478.0, + "step": 584 + }, + { + "epoch": 0.20837043633125557, + "grad_norm": 0.680581271648407, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.745169073343277, + "num_tokens": 168470998.0, + "step": 585 + }, + { + "epoch": 0.208726625111309, + "grad_norm": 0.6913917064666748, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7380895763635635, + "num_tokens": 168769264.0, + "step": 586 + }, + { + "epoch": 0.20908281389136243, + "grad_norm": 0.6641038060188293, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7678295224905014, + "num_tokens": 169090698.0, + "step": 587 + }, + { + "epoch": 0.20943900267141585, + "grad_norm": 0.6563974618911743, + "learning_rate": 1e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7502406090497971, + "num_tokens": 169394491.0, + "step": 588 + }, + { + "epoch": 0.20979519145146927, + "grad_norm": 0.6898133754730225, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7439365684986115, + "num_tokens": 169676977.0, + "step": 589 + }, + { + "epoch": 0.21015138023152272, + "grad_norm": 0.7438639998435974, + "learning_rate": 1e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7586598694324493, + "num_tokens": 169938895.0, + "step": 590 + }, + { + "epoch": 0.21050756901157613, + 
"grad_norm": 0.7657596468925476, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7471131682395935, + "num_tokens": 170221501.0, + "step": 591 + }, + { + "epoch": 0.21086375779162955, + "grad_norm": 0.718740701675415, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7418027818202972, + "num_tokens": 170492280.0, + "step": 592 + }, + { + "epoch": 0.211219946571683, + "grad_norm": 0.6820559501647949, + "learning_rate": 1e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7804529964923859, + "num_tokens": 170795630.0, + "step": 593 + }, + { + "epoch": 0.21157613535173642, + "grad_norm": 0.7484424114227295, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7306412160396576, + "num_tokens": 171043669.0, + "step": 594 + }, + { + "epoch": 0.21193232413178986, + "grad_norm": 0.6743587851524353, + "learning_rate": 1e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7587456107139587, + "num_tokens": 171329623.0, + "step": 595 + }, + { + "epoch": 0.21228851291184328, + "grad_norm": 0.6795403361320496, + "learning_rate": 1e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7566312998533249, + "num_tokens": 171633467.0, + "step": 596 + }, + { + "epoch": 0.2126447016918967, + "grad_norm": 0.6697918176651001, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7546685189008713, + "num_tokens": 171934426.0, + "step": 597 + }, + { + "epoch": 0.21300089047195014, + "grad_norm": 0.6765775084495544, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7601828575134277, + "num_tokens": 172252470.0, + "step": 598 + }, + { + "epoch": 0.21335707925200356, + "grad_norm": 0.7030366063117981, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7511226534843445, + "num_tokens": 172542109.0, + "step": 599 + }, + { + "epoch": 0.21371326803205698, + "grad_norm": 0.6936545968055725, + "learning_rate": 1e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7527179419994354, + 
"num_tokens": 172822978.0, + "step": 600 + }, + { + "epoch": 0.21406945681211043, + "grad_norm": 0.7172664999961853, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7544174492359161, + "num_tokens": 173096646.0, + "step": 601 + }, + { + "epoch": 0.21442564559216384, + "grad_norm": 0.6978586316108704, + "learning_rate": 1e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7618549913167953, + "num_tokens": 173373024.0, + "step": 602 + }, + { + "epoch": 0.21478183437221726, + "grad_norm": 0.7249910235404968, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7459557205438614, + "num_tokens": 173667351.0, + "step": 603 + }, + { + "epoch": 0.2151380231522707, + "grad_norm": 0.7335538268089294, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7439762204885483, + "num_tokens": 173939767.0, + "step": 604 + }, + { + "epoch": 0.21549421193232413, + "grad_norm": 0.6866166591644287, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7436898648738861, + "num_tokens": 174223573.0, + "step": 605 + }, + { + "epoch": 0.21585040071237757, + "grad_norm": 0.6658654808998108, + "learning_rate": 1e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7628216743469238, + "num_tokens": 174540307.0, + "step": 606 + }, + { + "epoch": 0.216206589492431, + "grad_norm": 0.7408151030540466, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7319081425666809, + "num_tokens": 174816593.0, + "step": 607 + }, + { + "epoch": 0.2165627782724844, + "grad_norm": 0.6846169829368591, + "learning_rate": 1e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7399078756570816, + "num_tokens": 175124934.0, + "step": 608 + }, + { + "epoch": 0.21691896705253785, + "grad_norm": 0.7195557951927185, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7411320507526398, + "num_tokens": 175411897.0, + "step": 609 + }, + { + "epoch": 0.21727515583259127, + "grad_norm": 0.716387152671814, + 
"learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7486654072999954, + "num_tokens": 175693895.0, + "step": 610 + }, + { + "epoch": 0.2176313446126447, + "grad_norm": 0.7180513143539429, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7338770627975464, + "num_tokens": 175967375.0, + "step": 611 + }, + { + "epoch": 0.21798753339269814, + "grad_norm": 0.7020632028579712, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7420262545347214, + "num_tokens": 176239939.0, + "step": 612 + }, + { + "epoch": 0.21834372217275155, + "grad_norm": 0.7205381393432617, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7526433616876602, + "num_tokens": 176508622.0, + "step": 613 + }, + { + "epoch": 0.218699910952805, + "grad_norm": 0.7182193398475647, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7348711937665939, + "num_tokens": 176800458.0, + "step": 614 + }, + { + "epoch": 0.21905609973285842, + "grad_norm": 0.7145103216171265, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7470154464244843, + "num_tokens": 177087128.0, + "step": 615 + }, + { + "epoch": 0.21941228851291183, + "grad_norm": 0.6423900127410889, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7467617988586426, + "num_tokens": 177402096.0, + "step": 616 + }, + { + "epoch": 0.21976847729296528, + "grad_norm": 0.6609490513801575, + "learning_rate": 1e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7646061331033707, + "num_tokens": 177709965.0, + "step": 617 + }, + { + "epoch": 0.2201246660730187, + "grad_norm": 0.7653310894966125, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7572337239980698, + "num_tokens": 177949951.0, + "step": 618 + }, + { + "epoch": 0.22048085485307212, + "grad_norm": 0.7397819757461548, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7389068007469177, + "num_tokens": 178225023.0, + "step": 619 + 
}, + { + "epoch": 0.22083704363312556, + "grad_norm": 0.7281933426856995, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.732717365026474, + "num_tokens": 178482035.0, + "step": 620 + }, + { + "epoch": 0.22119323241317898, + "grad_norm": 0.688987135887146, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7466321140527725, + "num_tokens": 178788829.0, + "step": 621 + }, + { + "epoch": 0.22154942119323243, + "grad_norm": 0.7982390522956848, + "learning_rate": 1e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7555684596300125, + "num_tokens": 179056296.0, + "step": 622 + }, + { + "epoch": 0.22190560997328584, + "grad_norm": 0.7428393959999084, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7612296640872955, + "num_tokens": 179316951.0, + "step": 623 + }, + { + "epoch": 0.22226179875333926, + "grad_norm": 0.6899006962776184, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7276035696268082, + "num_tokens": 179601114.0, + "step": 624 + }, + { + "epoch": 0.2226179875333927, + "grad_norm": 0.6782673001289368, + "learning_rate": 1e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7542476654052734, + "num_tokens": 179908219.0, + "step": 625 + }, + { + "epoch": 0.22297417631344613, + "grad_norm": 0.6694067716598511, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7469710409641266, + "num_tokens": 180198196.0, + "step": 626 + }, + { + "epoch": 0.22333036509349954, + "grad_norm": 0.7254921793937683, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7399675995111465, + "num_tokens": 180481440.0, + "step": 627 + }, + { + "epoch": 0.223686553873553, + "grad_norm": 0.7200340032577515, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7451265752315521, + "num_tokens": 180767807.0, + "step": 628 + }, + { + "epoch": 0.2240427426536064, + "grad_norm": 0.6903771758079529, + "learning_rate": 1e-06, + "loss": 0.8762, + 
"mean_token_accuracy": 0.734050452709198, + "num_tokens": 181073745.0, + "step": 629 + }, + { + "epoch": 0.22439893143365983, + "grad_norm": 0.7514396905899048, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7493252754211426, + "num_tokens": 181357170.0, + "step": 630 + }, + { + "epoch": 0.22475512021371327, + "grad_norm": 0.6830565929412842, + "learning_rate": 1e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7439787238836288, + "num_tokens": 181640875.0, + "step": 631 + }, + { + "epoch": 0.2251113089937667, + "grad_norm": 0.7044780254364014, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7464984208345413, + "num_tokens": 181962637.0, + "step": 632 + }, + { + "epoch": 0.22546749777382014, + "grad_norm": 0.7395034432411194, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7310810089111328, + "num_tokens": 182226154.0, + "step": 633 + }, + { + "epoch": 0.22582368655387355, + "grad_norm": 0.7178677320480347, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7414314299821854, + "num_tokens": 182499287.0, + "step": 634 + }, + { + "epoch": 0.22617987533392697, + "grad_norm": 0.7033289074897766, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7399980574846268, + "num_tokens": 182777010.0, + "step": 635 + }, + { + "epoch": 0.22653606411398042, + "grad_norm": 0.6953285932540894, + "learning_rate": 1e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7571704983711243, + "num_tokens": 183066730.0, + "step": 636 + }, + { + "epoch": 0.22689225289403384, + "grad_norm": 0.6924843788146973, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7499413341283798, + "num_tokens": 183388477.0, + "step": 637 + }, + { + "epoch": 0.22724844167408725, + "grad_norm": 0.7293652296066284, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7622853964567184, + "num_tokens": 183649802.0, + "step": 638 + }, + { + "epoch": 0.2276046304541407, + 
"grad_norm": 0.6816263794898987, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7482298612594604, + "num_tokens": 183931183.0, + "step": 639 + }, + { + "epoch": 0.22796081923419412, + "grad_norm": 0.6945924162864685, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7544895708560944, + "num_tokens": 184209585.0, + "step": 640 + }, + { + "epoch": 0.22831700801424756, + "grad_norm": 0.6166104078292847, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7487052828073502, + "num_tokens": 184550156.0, + "step": 641 + }, + { + "epoch": 0.22867319679430098, + "grad_norm": 0.6920416355133057, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7490693479776382, + "num_tokens": 184862332.0, + "step": 642 + }, + { + "epoch": 0.2290293855743544, + "grad_norm": 0.7212904691696167, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7497946321964264, + "num_tokens": 185142680.0, + "step": 643 + }, + { + "epoch": 0.22938557435440785, + "grad_norm": 0.6863221526145935, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.754435196518898, + "num_tokens": 185440103.0, + "step": 644 + }, + { + "epoch": 0.22974176313446126, + "grad_norm": 0.6865999698638916, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7585329413414001, + "num_tokens": 185723800.0, + "step": 645 + }, + { + "epoch": 0.23009795191451468, + "grad_norm": 0.7099013924598694, + "learning_rate": 1e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7489997148513794, + "num_tokens": 186033569.0, + "step": 646 + }, + { + "epoch": 0.23045414069456813, + "grad_norm": 0.6828547120094299, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7281650304794312, + "num_tokens": 186359925.0, + "step": 647 + }, + { + "epoch": 0.23081032947462155, + "grad_norm": 0.73198401927948, + "learning_rate": 1e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7453551888465881, + 
"num_tokens": 186622644.0, + "step": 648 + }, + { + "epoch": 0.231166518254675, + "grad_norm": 0.7392703890800476, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7362889051437378, + "num_tokens": 186893002.0, + "step": 649 + }, + { + "epoch": 0.2315227070347284, + "grad_norm": 0.7091670632362366, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.738905668258667, + "num_tokens": 187179785.0, + "step": 650 + }, + { + "epoch": 0.23187889581478183, + "grad_norm": 0.647760808467865, + "learning_rate": 1e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7554707825183868, + "num_tokens": 187507047.0, + "step": 651 + }, + { + "epoch": 0.23223508459483527, + "grad_norm": 0.6605339050292969, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7313409596681595, + "num_tokens": 187807539.0, + "step": 652 + }, + { + "epoch": 0.2325912733748887, + "grad_norm": 0.7139881253242493, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7461802661418915, + "num_tokens": 188073935.0, + "step": 653 + }, + { + "epoch": 0.2329474621549421, + "grad_norm": 0.6743558049201965, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7426755279302597, + "num_tokens": 188376447.0, + "step": 654 + }, + { + "epoch": 0.23330365093499555, + "grad_norm": 0.7488811016082764, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7383992075920105, + "num_tokens": 188647197.0, + "step": 655 + }, + { + "epoch": 0.23365983971504897, + "grad_norm": 0.7126250863075256, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7527973651885986, + "num_tokens": 188934338.0, + "step": 656 + }, + { + "epoch": 0.2340160284951024, + "grad_norm": 0.7192845344543457, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.76469986140728, + "num_tokens": 189184814.0, + "step": 657 + }, + { + "epoch": 0.23437221727515584, + "grad_norm": 0.7056124806404114, + "learning_rate": 
1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7436462193727493, + "num_tokens": 189467315.0, + "step": 658 + }, + { + "epoch": 0.23472840605520925, + "grad_norm": 0.7556808590888977, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7598977237939835, + "num_tokens": 189731902.0, + "step": 659 + }, + { + "epoch": 0.2350845948352627, + "grad_norm": 0.6923025250434875, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7555659115314484, + "num_tokens": 190026347.0, + "step": 660 + }, + { + "epoch": 0.23544078361531612, + "grad_norm": 0.6534726023674011, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7588642686605453, + "num_tokens": 190336334.0, + "step": 661 + }, + { + "epoch": 0.23579697239536954, + "grad_norm": 0.7234770059585571, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7592008262872696, + "num_tokens": 190602216.0, + "step": 662 + }, + { + "epoch": 0.23615316117542298, + "grad_norm": 0.7370746731758118, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7632476836442947, + "num_tokens": 190867598.0, + "step": 663 + }, + { + "epoch": 0.2365093499554764, + "grad_norm": 0.7284038662910461, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.759672999382019, + "num_tokens": 191138307.0, + "step": 664 + }, + { + "epoch": 0.23686553873552982, + "grad_norm": 0.6459569931030273, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7566838711500168, + "num_tokens": 191450734.0, + "step": 665 + }, + { + "epoch": 0.23722172751558326, + "grad_norm": 0.670545220375061, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.759641483426094, + "num_tokens": 191780726.0, + "step": 666 + }, + { + "epoch": 0.23757791629563668, + "grad_norm": 0.6999671459197998, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7651474326848984, + "num_tokens": 192061980.0, + "step": 667 + }, + { + "epoch": 
0.23793410507569013, + "grad_norm": 0.7320982813835144, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7342398017644882, + "num_tokens": 192361144.0, + "step": 668 + }, + { + "epoch": 0.23829029385574355, + "grad_norm": 0.6931456327438354, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7606761157512665, + "num_tokens": 192656691.0, + "step": 669 + }, + { + "epoch": 0.23864648263579696, + "grad_norm": 0.6822105646133423, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7335691004991531, + "num_tokens": 192954474.0, + "step": 670 + }, + { + "epoch": 0.2390026714158504, + "grad_norm": 0.7710821628570557, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7401488423347473, + "num_tokens": 193223451.0, + "step": 671 + }, + { + "epoch": 0.23935886019590383, + "grad_norm": 0.7434242963790894, + "learning_rate": 1e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7519496381282806, + "num_tokens": 193498587.0, + "step": 672 + }, + { + "epoch": 0.23971504897595725, + "grad_norm": 0.7127635478973389, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7536232322454453, + "num_tokens": 193779118.0, + "step": 673 + }, + { + "epoch": 0.2400712377560107, + "grad_norm": 0.6454879641532898, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7639312595129013, + "num_tokens": 194102314.0, + "step": 674 + }, + { + "epoch": 0.2404274265360641, + "grad_norm": 0.7799111008644104, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7415604144334793, + "num_tokens": 194343958.0, + "step": 675 + }, + { + "epoch": 0.24078361531611756, + "grad_norm": 0.7081688046455383, + "learning_rate": 1e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7523187100887299, + "num_tokens": 194621986.0, + "step": 676 + }, + { + "epoch": 0.24113980409617097, + "grad_norm": 0.6567289233207703, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 
0.7381274402141571, + "num_tokens": 194956573.0, + "step": 677 + }, + { + "epoch": 0.2414959928762244, + "grad_norm": 0.6999390125274658, + "learning_rate": 1e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7557232528924942, + "num_tokens": 195265661.0, + "step": 678 + }, + { + "epoch": 0.24185218165627784, + "grad_norm": 0.6988896727561951, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.755840003490448, + "num_tokens": 195571139.0, + "step": 679 + }, + { + "epoch": 0.24220837043633126, + "grad_norm": 0.7274938821792603, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7449721843004227, + "num_tokens": 195840308.0, + "step": 680 + }, + { + "epoch": 0.24256455921638467, + "grad_norm": 0.7091554999351501, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7656887769699097, + "num_tokens": 196121665.0, + "step": 681 + }, + { + "epoch": 0.24292074799643812, + "grad_norm": 0.7042673826217651, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7491706907749176, + "num_tokens": 196394517.0, + "step": 682 + }, + { + "epoch": 0.24327693677649154, + "grad_norm": 0.7227875590324402, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.745087206363678, + "num_tokens": 196701550.0, + "step": 683 + }, + { + "epoch": 0.24363312555654498, + "grad_norm": 0.6980577111244202, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7402597963809967, + "num_tokens": 196973596.0, + "step": 684 + }, + { + "epoch": 0.2439893143365984, + "grad_norm": 0.6674192547798157, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7534281462430954, + "num_tokens": 197278934.0, + "step": 685 + }, + { + "epoch": 0.24434550311665182, + "grad_norm": 0.7565862536430359, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7307509630918503, + "num_tokens": 197546893.0, + "step": 686 + }, + { + "epoch": 0.24470169189670526, + "grad_norm": 
0.7578544616699219, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7445889115333557, + "num_tokens": 197821356.0, + "step": 687 + }, + { + "epoch": 0.24505788067675868, + "grad_norm": 0.6659411191940308, + "learning_rate": 1e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7577791512012482, + "num_tokens": 198127688.0, + "step": 688 + }, + { + "epoch": 0.2454140694568121, + "grad_norm": 0.7317038774490356, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7416885197162628, + "num_tokens": 198425572.0, + "step": 689 + }, + { + "epoch": 0.24577025823686555, + "grad_norm": 0.7051808834075928, + "learning_rate": 1e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7587350308895111, + "num_tokens": 198724269.0, + "step": 690 + }, + { + "epoch": 0.24612644701691896, + "grad_norm": 0.6917704939842224, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7597584873437881, + "num_tokens": 199012540.0, + "step": 691 + }, + { + "epoch": 0.24648263579697238, + "grad_norm": 0.7080327868461609, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7365793138742447, + "num_tokens": 199306655.0, + "step": 692 + }, + { + "epoch": 0.24683882457702583, + "grad_norm": 0.7196453809738159, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7522776871919632, + "num_tokens": 199586117.0, + "step": 693 + }, + { + "epoch": 0.24719501335707925, + "grad_norm": 0.6558706164360046, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7712924182415009, + "num_tokens": 199895631.0, + "step": 694 + }, + { + "epoch": 0.2475512021371327, + "grad_norm": 0.72367262840271, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7401653975248337, + "num_tokens": 200170063.0, + "step": 695 + }, + { + "epoch": 0.2479073909171861, + "grad_norm": 0.7270874381065369, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7502354681491852, + "num_tokens": 
200455377.0, + "step": 696 + }, + { + "epoch": 0.24826357969723953, + "grad_norm": 0.7190952897071838, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7377060055732727, + "num_tokens": 200728598.0, + "step": 697 + }, + { + "epoch": 0.24861976847729297, + "grad_norm": 0.7311384677886963, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7519445270299911, + "num_tokens": 200988066.0, + "step": 698 + }, + { + "epoch": 0.2489759572573464, + "grad_norm": 0.6993615031242371, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7460096180438995, + "num_tokens": 201265648.0, + "step": 699 + }, + { + "epoch": 0.2493321460373998, + "grad_norm": 0.699140727519989, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7381905913352966, + "num_tokens": 201579442.0, + "step": 700 + }, + { + "epoch": 0.24968833481745326, + "grad_norm": 0.704163670539856, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7469717711210251, + "num_tokens": 201844613.0, + "step": 701 + }, + { + "epoch": 0.2500445235975067, + "grad_norm": 0.7185049653053284, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7423136383295059, + "num_tokens": 202109017.0, + "step": 702 + }, + { + "epoch": 0.2504007123775601, + "grad_norm": 0.8205074667930603, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7342620491981506, + "num_tokens": 202395331.0, + "step": 703 + }, + { + "epoch": 0.2507569011576135, + "grad_norm": 0.695286750793457, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7600405067205429, + "num_tokens": 202699485.0, + "step": 704 + }, + { + "epoch": 0.25111308993766696, + "grad_norm": 0.7136188745498657, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7436291724443436, + "num_tokens": 203000560.0, + "step": 705 + }, + { + "epoch": 0.2514692787177204, + "grad_norm": 0.7177485823631287, + "learning_rate": 1e-06, + "loss": 
0.7797, + "mean_token_accuracy": 0.7576039284467697, + "num_tokens": 203265054.0, + "step": 706 + }, + { + "epoch": 0.25182546749777385, + "grad_norm": 0.7639326453208923, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7445334494113922, + "num_tokens": 203517775.0, + "step": 707 + }, + { + "epoch": 0.25218165627782724, + "grad_norm": 0.7297584414482117, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7422369718551636, + "num_tokens": 203801947.0, + "step": 708 + }, + { + "epoch": 0.2525378450578807, + "grad_norm": 0.655819296836853, + "learning_rate": 1e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7524457424879074, + "num_tokens": 204112558.0, + "step": 709 + }, + { + "epoch": 0.25289403383793413, + "grad_norm": 0.7249686121940613, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.746833324432373, + "num_tokens": 204386920.0, + "step": 710 + }, + { + "epoch": 0.2532502226179875, + "grad_norm": 0.6992975473403931, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7684835195541382, + "num_tokens": 204682128.0, + "step": 711 + }, + { + "epoch": 0.25360641139804097, + "grad_norm": 0.7084347009658813, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7412633299827576, + "num_tokens": 204972909.0, + "step": 712 + }, + { + "epoch": 0.2539626001780944, + "grad_norm": 0.6739515066146851, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7491350769996643, + "num_tokens": 205283585.0, + "step": 713 + }, + { + "epoch": 0.2543187889581478, + "grad_norm": 0.7178598046302795, + "learning_rate": 1e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.747789278626442, + "num_tokens": 205564787.0, + "step": 714 + }, + { + "epoch": 0.25467497773820125, + "grad_norm": 0.6992794275283813, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7376389652490616, + "num_tokens": 205884707.0, + "step": 715 + }, + { + "epoch": 
0.2550311665182547, + "grad_norm": 0.6739900708198547, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.750601664185524, + "num_tokens": 206173716.0, + "step": 716 + }, + { + "epoch": 0.2553873552983081, + "grad_norm": 0.7082987427711487, + "learning_rate": 1e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7585623413324356, + "num_tokens": 206451566.0, + "step": 717 + }, + { + "epoch": 0.25574354407836153, + "grad_norm": 0.6578952074050903, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7491383105516434, + "num_tokens": 206767673.0, + "step": 718 + }, + { + "epoch": 0.256099732858415, + "grad_norm": 0.6633303761482239, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7537440657615662, + "num_tokens": 207102767.0, + "step": 719 + }, + { + "epoch": 0.25645592163846836, + "grad_norm": 0.6737834811210632, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7709829807281494, + "num_tokens": 207372706.0, + "step": 720 + }, + { + "epoch": 0.2568121104185218, + "grad_norm": 0.7115257382392883, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7534657269716263, + "num_tokens": 207650152.0, + "step": 721 + }, + { + "epoch": 0.25716829919857526, + "grad_norm": 0.6705921292304993, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.750194638967514, + "num_tokens": 207949184.0, + "step": 722 + }, + { + "epoch": 0.25752448797862865, + "grad_norm": 0.7154313325881958, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.740323394536972, + "num_tokens": 208210832.0, + "step": 723 + }, + { + "epoch": 0.2578806767586821, + "grad_norm": 0.7453343868255615, + "learning_rate": 1e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7523172199726105, + "num_tokens": 208480077.0, + "step": 724 + }, + { + "epoch": 0.25823686553873554, + "grad_norm": 0.6871276497840881, + "learning_rate": 1e-06, + "loss": 0.7849, + "mean_token_accuracy": 
0.7603102326393127, + "num_tokens": 208764080.0, + "step": 725 + }, + { + "epoch": 0.258593054318789, + "grad_norm": 0.6767525672912598, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7531595230102539, + "num_tokens": 209081363.0, + "step": 726 + }, + { + "epoch": 0.2589492430988424, + "grad_norm": 0.7176051139831543, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7338608056306839, + "num_tokens": 209376580.0, + "step": 727 + }, + { + "epoch": 0.2593054318788958, + "grad_norm": 0.6770071983337402, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7449493557214737, + "num_tokens": 209699634.0, + "step": 728 + }, + { + "epoch": 0.25966162065894927, + "grad_norm": 0.7107053995132446, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7255515605211258, + "num_tokens": 209987117.0, + "step": 729 + }, + { + "epoch": 0.26001780943900266, + "grad_norm": 0.7009879946708679, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7578239440917969, + "num_tokens": 210265775.0, + "step": 730 + }, + { + "epoch": 0.2603739982190561, + "grad_norm": 0.7117840051651001, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7610590606927872, + "num_tokens": 210555925.0, + "step": 731 + }, + { + "epoch": 0.26073018699910955, + "grad_norm": 0.7364587783813477, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7505362033843994, + "num_tokens": 210823594.0, + "step": 732 + }, + { + "epoch": 0.26108637577916294, + "grad_norm": 0.7315259575843811, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7404962927103043, + "num_tokens": 211128489.0, + "step": 733 + }, + { + "epoch": 0.2614425645592164, + "grad_norm": 0.7419450283050537, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7429973781108856, + "num_tokens": 211396673.0, + "step": 734 + }, + { + "epoch": 0.26179875333926983, + "grad_norm": 
0.6905232667922974, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7509350776672363, + "num_tokens": 211687889.0, + "step": 735 + }, + { + "epoch": 0.2621549421193232, + "grad_norm": 0.6519961357116699, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.76048943400383, + "num_tokens": 211987608.0, + "step": 736 + }, + { + "epoch": 0.26251113089937667, + "grad_norm": 0.6874048113822937, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7498375177383423, + "num_tokens": 212307179.0, + "step": 737 + }, + { + "epoch": 0.2628673196794301, + "grad_norm": 0.6653056740760803, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7491779178380966, + "num_tokens": 212607868.0, + "step": 738 + }, + { + "epoch": 0.2632235084594835, + "grad_norm": 0.737671434879303, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7394831329584122, + "num_tokens": 212864058.0, + "step": 739 + }, + { + "epoch": 0.26357969723953695, + "grad_norm": 0.6643927097320557, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7447740584611893, + "num_tokens": 213159238.0, + "step": 740 + }, + { + "epoch": 0.2639358860195904, + "grad_norm": 0.718214213848114, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7637405544519424, + "num_tokens": 213417895.0, + "step": 741 + }, + { + "epoch": 0.2642920747996438, + "grad_norm": 0.7283039093017578, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7432980537414551, + "num_tokens": 213678423.0, + "step": 742 + }, + { + "epoch": 0.26464826357969723, + "grad_norm": 0.6992026567459106, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7576179653406143, + "num_tokens": 213951108.0, + "step": 743 + }, + { + "epoch": 0.2650044523597507, + "grad_norm": 0.7239747047424316, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7411033064126968, + "num_tokens": 214223855.0, + 
"step": 744 + }, + { + "epoch": 0.2653606411398041, + "grad_norm": 0.7080898284912109, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7614042460918427, + "num_tokens": 214511110.0, + "step": 745 + }, + { + "epoch": 0.2657168299198575, + "grad_norm": 0.7156978249549866, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7472742646932602, + "num_tokens": 214779447.0, + "step": 746 + }, + { + "epoch": 0.26607301869991096, + "grad_norm": 0.693128764629364, + "learning_rate": 1e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7555952370166779, + "num_tokens": 215065336.0, + "step": 747 + }, + { + "epoch": 0.2664292074799644, + "grad_norm": 0.6802164912223816, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7516331225633621, + "num_tokens": 215375102.0, + "step": 748 + }, + { + "epoch": 0.2667853962600178, + "grad_norm": 0.6883007287979126, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7452035248279572, + "num_tokens": 215647051.0, + "step": 749 + }, + { + "epoch": 0.26714158504007124, + "grad_norm": 0.6887452602386475, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7530091404914856, + "num_tokens": 215934404.0, + "step": 750 + }, + { + "epoch": 0.2674977738201247, + "grad_norm": 0.7235872745513916, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7512509375810623, + "num_tokens": 216199947.0, + "step": 751 + }, + { + "epoch": 0.2678539626001781, + "grad_norm": 0.7168263792991638, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7484491914510727, + "num_tokens": 216481795.0, + "step": 752 + }, + { + "epoch": 0.2682101513802315, + "grad_norm": 0.6660345792770386, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7600079029798508, + "num_tokens": 216777510.0, + "step": 753 + }, + { + "epoch": 0.26856634016028497, + "grad_norm": 0.6318036913871765, + "learning_rate": 1e-06, + "loss": 0.7971, + 
"mean_token_accuracy": 0.7517234981060028, + "num_tokens": 217133096.0, + "step": 754 + }, + { + "epoch": 0.26892252894033836, + "grad_norm": 0.6929224133491516, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7534524202346802, + "num_tokens": 217434680.0, + "step": 755 + }, + { + "epoch": 0.2692787177203918, + "grad_norm": 0.6276294589042664, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7493168711662292, + "num_tokens": 217764064.0, + "step": 756 + }, + { + "epoch": 0.26963490650044525, + "grad_norm": 0.6782474517822266, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7472713738679886, + "num_tokens": 218081524.0, + "step": 757 + }, + { + "epoch": 0.26999109528049864, + "grad_norm": 0.6609413027763367, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7415159940719604, + "num_tokens": 218394996.0, + "step": 758 + }, + { + "epoch": 0.2703472840605521, + "grad_norm": 0.6771866679191589, + "learning_rate": 1e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.759042426943779, + "num_tokens": 218708474.0, + "step": 759 + }, + { + "epoch": 0.27070347284060553, + "grad_norm": 0.7062015533447266, + "learning_rate": 1e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7429337650537491, + "num_tokens": 218987867.0, + "step": 760 + }, + { + "epoch": 0.271059661620659, + "grad_norm": 0.7284206748008728, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7440585196018219, + "num_tokens": 219253144.0, + "step": 761 + }, + { + "epoch": 0.27141585040071237, + "grad_norm": 0.6721488237380981, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7525697946548462, + "num_tokens": 219559946.0, + "step": 762 + }, + { + "epoch": 0.2717720391807658, + "grad_norm": 0.6966146230697632, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7465904504060745, + "num_tokens": 219854854.0, + "step": 763 + }, + { + "epoch": 0.27212822796081926, + 
"grad_norm": 0.7214687466621399, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7546590119600296, + "num_tokens": 220134088.0, + "step": 764 + }, + { + "epoch": 0.27248441674087265, + "grad_norm": 0.6731786727905273, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7361244559288025, + "num_tokens": 220438530.0, + "step": 765 + }, + { + "epoch": 0.2728406055209261, + "grad_norm": 0.663239061832428, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7345352917909622, + "num_tokens": 220745575.0, + "step": 766 + }, + { + "epoch": 0.27319679430097954, + "grad_norm": 0.6731104254722595, + "learning_rate": 1e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7620676606893539, + "num_tokens": 221039595.0, + "step": 767 + }, + { + "epoch": 0.27355298308103293, + "grad_norm": 0.6507112383842468, + "learning_rate": 1e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7586677521467209, + "num_tokens": 221363707.0, + "step": 768 + }, + { + "epoch": 0.2739091718610864, + "grad_norm": 0.660070538520813, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7692445367574692, + "num_tokens": 221649993.0, + "step": 769 + }, + { + "epoch": 0.2742653606411398, + "grad_norm": 0.7332190275192261, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.748064711689949, + "num_tokens": 221914625.0, + "step": 770 + }, + { + "epoch": 0.2746215494211932, + "grad_norm": 0.699192225933075, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7350447624921799, + "num_tokens": 222202406.0, + "step": 771 + }, + { + "epoch": 0.27497773820124666, + "grad_norm": 0.6606435775756836, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7428998202085495, + "num_tokens": 222506128.0, + "step": 772 + }, + { + "epoch": 0.2753339269813001, + "grad_norm": 0.6447018980979919, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7517787665128708, + "num_tokens": 
222835840.0, + "step": 773 + }, + { + "epoch": 0.2756901157613535, + "grad_norm": 0.6870838403701782, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7543943673372269, + "num_tokens": 223133876.0, + "step": 774 + }, + { + "epoch": 0.27604630454140694, + "grad_norm": 0.7193928360939026, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7542615383863449, + "num_tokens": 223426170.0, + "step": 775 + }, + { + "epoch": 0.2764024933214604, + "grad_norm": 0.6818604469299316, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7628755122423172, + "num_tokens": 223719342.0, + "step": 776 + }, + { + "epoch": 0.2767586821015138, + "grad_norm": 0.6946378946304321, + "learning_rate": 1e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7536285668611526, + "num_tokens": 224029140.0, + "step": 777 + }, + { + "epoch": 0.2771148708815672, + "grad_norm": 0.7441150546073914, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7546705454587936, + "num_tokens": 224292034.0, + "step": 778 + }, + { + "epoch": 0.27747105966162067, + "grad_norm": 0.7676928639411926, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7463355511426926, + "num_tokens": 224565592.0, + "step": 779 + }, + { + "epoch": 0.2778272484416741, + "grad_norm": 0.7401454448699951, + "learning_rate": 1e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.761330708861351, + "num_tokens": 224852048.0, + "step": 780 + }, + { + "epoch": 0.2781834372217275, + "grad_norm": 0.7248575687408447, + "learning_rate": 1e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7537535279989243, + "num_tokens": 225114053.0, + "step": 781 + }, + { + "epoch": 0.27853962600178095, + "grad_norm": 0.7168235182762146, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7429585307836533, + "num_tokens": 225375517.0, + "step": 782 + }, + { + "epoch": 0.2788958147818344, + "grad_norm": 0.7789493799209595, + "learning_rate": 1e-06, + 
"loss": 0.8532, + "mean_token_accuracy": 0.7346650958061218, + "num_tokens": 225623135.0, + "step": 783 + }, + { + "epoch": 0.2792520035618878, + "grad_norm": 0.6482436060905457, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7642972767353058, + "num_tokens": 225914562.0, + "step": 784 + }, + { + "epoch": 0.27960819234194123, + "grad_norm": 0.6988193988800049, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7610199153423309, + "num_tokens": 226229617.0, + "step": 785 + }, + { + "epoch": 0.2799643811219947, + "grad_norm": 0.7552284002304077, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7406427264213562, + "num_tokens": 226498586.0, + "step": 786 + }, + { + "epoch": 0.28032056990204807, + "grad_norm": 0.7348888516426086, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7451271265745163, + "num_tokens": 226764788.0, + "step": 787 + }, + { + "epoch": 0.2806767586821015, + "grad_norm": 0.7152369618415833, + "learning_rate": 1e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7676392197608948, + "num_tokens": 227070211.0, + "step": 788 + }, + { + "epoch": 0.28103294746215496, + "grad_norm": 0.7334146499633789, + "learning_rate": 1e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7538636922836304, + "num_tokens": 227351893.0, + "step": 789 + }, + { + "epoch": 0.28138913624220835, + "grad_norm": 0.7707662582397461, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7358496487140656, + "num_tokens": 227641623.0, + "step": 790 + }, + { + "epoch": 0.2817453250222618, + "grad_norm": 0.7308485507965088, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7609463781118393, + "num_tokens": 227923831.0, + "step": 791 + }, + { + "epoch": 0.28210151380231524, + "grad_norm": 0.7067363262176514, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7413235902786255, + "num_tokens": 228208058.0, + "step": 792 + }, + { + "epoch": 
0.28245770258236863, + "grad_norm": 0.7373730540275574, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7450270503759384, + "num_tokens": 228477427.0, + "step": 793 + }, + { + "epoch": 0.2828138913624221, + "grad_norm": 0.7207044959068298, + "learning_rate": 1e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7517948150634766, + "num_tokens": 228782585.0, + "step": 794 + }, + { + "epoch": 0.2831700801424755, + "grad_norm": 0.7723159193992615, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7588718086481094, + "num_tokens": 229068356.0, + "step": 795 + }, + { + "epoch": 0.28352626892252897, + "grad_norm": 0.6924941539764404, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.753755733370781, + "num_tokens": 229368414.0, + "step": 796 + }, + { + "epoch": 0.28388245770258236, + "grad_norm": 0.6777812242507935, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7419353872537613, + "num_tokens": 229670821.0, + "step": 797 + }, + { + "epoch": 0.2842386464826358, + "grad_norm": 0.7250900268554688, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7394212186336517, + "num_tokens": 229968592.0, + "step": 798 + }, + { + "epoch": 0.28459483526268925, + "grad_norm": 0.736255943775177, + "learning_rate": 1e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7619789838790894, + "num_tokens": 230245460.0, + "step": 799 + }, + { + "epoch": 0.28495102404274264, + "grad_norm": 0.7272618412971497, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7367000877857208, + "num_tokens": 230521966.0, + "step": 800 + }, + { + "epoch": 0.2853072128227961, + "grad_norm": 0.689778208732605, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7286136597394943, + "num_tokens": 230829228.0, + "step": 801 + }, + { + "epoch": 0.28566340160284953, + "grad_norm": 0.7487449645996094, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 
0.7562494874000549, + "num_tokens": 231073319.0, + "step": 802 + }, + { + "epoch": 0.2860195903829029, + "grad_norm": 0.6909729242324829, + "learning_rate": 1e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7607486397027969, + "num_tokens": 231350174.0, + "step": 803 + }, + { + "epoch": 0.28637577916295637, + "grad_norm": 0.7425106167793274, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7456308156251907, + "num_tokens": 231634996.0, + "step": 804 + }, + { + "epoch": 0.2867319679430098, + "grad_norm": 0.6934384703636169, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7318065166473389, + "num_tokens": 231918355.0, + "step": 805 + }, + { + "epoch": 0.2870881567230632, + "grad_norm": 0.7272054553031921, + "learning_rate": 1e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.75608791410923, + "num_tokens": 232189801.0, + "step": 806 + }, + { + "epoch": 0.28744434550311665, + "grad_norm": 0.7158402800559998, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.743634894490242, + "num_tokens": 232463013.0, + "step": 807 + }, + { + "epoch": 0.2878005342831701, + "grad_norm": 0.7105263471603394, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7571685463190079, + "num_tokens": 232727710.0, + "step": 808 + }, + { + "epoch": 0.2881567230632235, + "grad_norm": 0.7258197069168091, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7500891536474228, + "num_tokens": 233001458.0, + "step": 809 + }, + { + "epoch": 0.28851291184327693, + "grad_norm": 0.7532241344451904, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7472488582134247, + "num_tokens": 233285875.0, + "step": 810 + }, + { + "epoch": 0.2888691006233304, + "grad_norm": 0.6770374774932861, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7421658337116241, + "num_tokens": 233578907.0, + "step": 811 + }, + { + "epoch": 0.28922528940338377, + "grad_norm": 
0.6595101356506348, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7514682114124298, + "num_tokens": 233905090.0, + "step": 812 + }, + { + "epoch": 0.2895814781834372, + "grad_norm": 0.7218517661094666, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7496904879808426, + "num_tokens": 234184734.0, + "step": 813 + }, + { + "epoch": 0.28993766696349066, + "grad_norm": 0.6923441886901855, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7609210908412933, + "num_tokens": 234497167.0, + "step": 814 + }, + { + "epoch": 0.2902938557435441, + "grad_norm": 0.7028517723083496, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7300514727830887, + "num_tokens": 234792469.0, + "step": 815 + }, + { + "epoch": 0.2906500445235975, + "grad_norm": 0.6961959004402161, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.7581480294466019, + "num_tokens": 235071582.0, + "step": 816 + }, + { + "epoch": 0.29100623330365094, + "grad_norm": 0.7166313529014587, + "learning_rate": 1e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7643808126449585, + "num_tokens": 235339644.0, + "step": 817 + }, + { + "epoch": 0.2913624220837044, + "grad_norm": 0.6799490451812744, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7491903305053711, + "num_tokens": 235631084.0, + "step": 818 + }, + { + "epoch": 0.2917186108637578, + "grad_norm": 0.7097505331039429, + "learning_rate": 1e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7579211890697479, + "num_tokens": 235921113.0, + "step": 819 + }, + { + "epoch": 0.2920747996438112, + "grad_norm": 0.7227668166160583, + "learning_rate": 1e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.7604870200157166, + "num_tokens": 236198722.0, + "step": 820 + }, + { + "epoch": 0.29243098842386467, + "grad_norm": 0.666581928730011, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.752911388874054, + "num_tokens": 236496499.0, 
+ "step": 821 + }, + { + "epoch": 0.29278717720391806, + "grad_norm": 0.6873966455459595, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7632572650909424, + "num_tokens": 236775952.0, + "step": 822 + }, + { + "epoch": 0.2931433659839715, + "grad_norm": 0.6604158282279968, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7465717047452927, + "num_tokens": 237078822.0, + "step": 823 + }, + { + "epoch": 0.29349955476402495, + "grad_norm": 0.7251026034355164, + "learning_rate": 1e-06, + "loss": 0.76, + "mean_token_accuracy": 0.763690397143364, + "num_tokens": 237351636.0, + "step": 824 + }, + { + "epoch": 0.29385574354407834, + "grad_norm": 0.6868265271186829, + "learning_rate": 1e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7465769499540329, + "num_tokens": 237638530.0, + "step": 825 + }, + { + "epoch": 0.2942119323241318, + "grad_norm": 0.6572822332382202, + "learning_rate": 1e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7550665140151978, + "num_tokens": 237942897.0, + "step": 826 + }, + { + "epoch": 0.29456812110418523, + "grad_norm": 0.7030782103538513, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7500248700380325, + "num_tokens": 238207693.0, + "step": 827 + }, + { + "epoch": 0.2949243098842386, + "grad_norm": 0.6373904347419739, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7459231019020081, + "num_tokens": 238532642.0, + "step": 828 + }, + { + "epoch": 0.29528049866429207, + "grad_norm": 0.7168747186660767, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7271191030740738, + "num_tokens": 238830865.0, + "step": 829 + }, + { + "epoch": 0.2956366874443455, + "grad_norm": 0.7579995393753052, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7365850061178207, + "num_tokens": 239108097.0, + "step": 830 + }, + { + "epoch": 0.2959928762243989, + "grad_norm": 0.7020167112350464, + "learning_rate": 1e-06, + "loss": 0.9174, + 
"mean_token_accuracy": 0.7214224189519882, + "num_tokens": 239394488.0, + "step": 831 + }, + { + "epoch": 0.29634906500445235, + "grad_norm": 0.7049803733825684, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7552302181720734, + "num_tokens": 239680852.0, + "step": 832 + }, + { + "epoch": 0.2967052537845058, + "grad_norm": 0.7571737766265869, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7521811127662659, + "num_tokens": 239942961.0, + "step": 833 + }, + { + "epoch": 0.29706144256455924, + "grad_norm": 0.7242873311042786, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.747767984867096, + "num_tokens": 240214406.0, + "step": 834 + }, + { + "epoch": 0.29741763134461263, + "grad_norm": 0.6843145489692688, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.755863219499588, + "num_tokens": 240510191.0, + "step": 835 + }, + { + "epoch": 0.2977738201246661, + "grad_norm": 0.6869252324104309, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.745522752404213, + "num_tokens": 240813606.0, + "step": 836 + }, + { + "epoch": 0.2981300089047195, + "grad_norm": 0.752911388874054, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7406427413225174, + "num_tokens": 241089235.0, + "step": 837 + }, + { + "epoch": 0.2984861976847729, + "grad_norm": 0.6923590302467346, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7681547999382019, + "num_tokens": 241370562.0, + "step": 838 + }, + { + "epoch": 0.29884238646482636, + "grad_norm": 0.7438693046569824, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7411793768405914, + "num_tokens": 241617600.0, + "step": 839 + }, + { + "epoch": 0.2991985752448798, + "grad_norm": 0.7250669002532959, + "learning_rate": 1e-06, + "loss": 0.762, + "mean_token_accuracy": 0.761219248175621, + "num_tokens": 241910832.0, + "step": 840 + }, + { + "epoch": 0.2995547640249332, + 
"grad_norm": 0.6753978133201599, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7600900530815125, + "num_tokens": 242229528.0, + "step": 841 + }, + { + "epoch": 0.29991095280498664, + "grad_norm": 0.7048127055168152, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7272664159536362, + "num_tokens": 242536711.0, + "step": 842 + }, + { + "epoch": 0.3002671415850401, + "grad_norm": 1.7273650169372559, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7449328750371933, + "num_tokens": 242853013.0, + "step": 843 + }, + { + "epoch": 0.3006233303650935, + "grad_norm": 0.709145188331604, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7694251984357834, + "num_tokens": 243152616.0, + "step": 844 + }, + { + "epoch": 0.3009795191451469, + "grad_norm": 0.6986688375473022, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.756277397274971, + "num_tokens": 243442096.0, + "step": 845 + }, + { + "epoch": 0.30133570792520037, + "grad_norm": 0.725128710269928, + "learning_rate": 1e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.750907301902771, + "num_tokens": 243708994.0, + "step": 846 + }, + { + "epoch": 0.30169189670525376, + "grad_norm": 0.678400993347168, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7575680315494537, + "num_tokens": 244018013.0, + "step": 847 + }, + { + "epoch": 0.3020480854853072, + "grad_norm": 0.7668299674987793, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7412239462137222, + "num_tokens": 244277366.0, + "step": 848 + }, + { + "epoch": 0.30240427426536065, + "grad_norm": 0.7141560912132263, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7627938240766525, + "num_tokens": 244571005.0, + "step": 849 + }, + { + "epoch": 0.3027604630454141, + "grad_norm": 0.6657665371894836, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7671635895967484, + "num_tokens": 
244905721.0, + "step": 850 + }, + { + "epoch": 0.3031166518254675, + "grad_norm": 0.6718693375587463, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7354791313409805, + "num_tokens": 245213161.0, + "step": 851 + }, + { + "epoch": 0.30347284060552093, + "grad_norm": 0.6846584677696228, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.754711776971817, + "num_tokens": 245486459.0, + "step": 852 + }, + { + "epoch": 0.3038290293855744, + "grad_norm": 0.7001514434814453, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.748623326420784, + "num_tokens": 245765657.0, + "step": 853 + }, + { + "epoch": 0.30418521816562777, + "grad_norm": 0.668009340763092, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.747431144118309, + "num_tokens": 246079119.0, + "step": 854 + }, + { + "epoch": 0.3045414069456812, + "grad_norm": 0.6678745746612549, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7611160576343536, + "num_tokens": 246388321.0, + "step": 855 + }, + { + "epoch": 0.30489759572573466, + "grad_norm": 0.7219774723052979, + "learning_rate": 1e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.758843332529068, + "num_tokens": 246672754.0, + "step": 856 + }, + { + "epoch": 0.30525378450578805, + "grad_norm": 0.7443761229515076, + "learning_rate": 1e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7493714839220047, + "num_tokens": 246934500.0, + "step": 857 + }, + { + "epoch": 0.3056099732858415, + "grad_norm": 0.7347242832183838, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.739608108997345, + "num_tokens": 247195661.0, + "step": 858 + }, + { + "epoch": 0.30596616206589494, + "grad_norm": 0.6807264089584351, + "learning_rate": 1e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.7554998248815536, + "num_tokens": 247532376.0, + "step": 859 + }, + { + "epoch": 0.30632235084594833, + "grad_norm": 0.6941683292388916, + "learning_rate": 1e-06, + "loss": 
0.7834, + "mean_token_accuracy": 0.7553754597902298, + "num_tokens": 247812609.0, + "step": 860 + }, + { + "epoch": 0.3066785396260018, + "grad_norm": 0.7156988382339478, + "learning_rate": 1e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7645259648561478, + "num_tokens": 248071205.0, + "step": 861 + }, + { + "epoch": 0.3070347284060552, + "grad_norm": 0.6677640080451965, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7613339126110077, + "num_tokens": 248363369.0, + "step": 862 + }, + { + "epoch": 0.3073909171861086, + "grad_norm": 0.6616682410240173, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7580566257238388, + "num_tokens": 248687835.0, + "step": 863 + }, + { + "epoch": 0.30774710596616206, + "grad_norm": 0.7180230617523193, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7627349942922592, + "num_tokens": 248962340.0, + "step": 864 + }, + { + "epoch": 0.3081032947462155, + "grad_norm": 0.7345540523529053, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7422923892736435, + "num_tokens": 249221168.0, + "step": 865 + }, + { + "epoch": 0.3084594835262689, + "grad_norm": 0.637608528137207, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7750955522060394, + "num_tokens": 249526659.0, + "step": 866 + }, + { + "epoch": 0.30881567230632234, + "grad_norm": 0.699893057346344, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7499500066041946, + "num_tokens": 249806115.0, + "step": 867 + }, + { + "epoch": 0.3091718610863758, + "grad_norm": 0.701729953289032, + "learning_rate": 1e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7572732269763947, + "num_tokens": 250070824.0, + "step": 868 + }, + { + "epoch": 0.30952804986642923, + "grad_norm": 0.6971397399902344, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7625914216041565, + "num_tokens": 250354325.0, + "step": 869 + }, + { + "epoch": 0.3098842386464826, + 
"grad_norm": 0.6967325210571289, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7618680745363235, + "num_tokens": 250672394.0, + "step": 870 + }, + { + "epoch": 0.31024042742653607, + "grad_norm": 0.6822255253791809, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7363189309835434, + "num_tokens": 250969506.0, + "step": 871 + }, + { + "epoch": 0.3105966162065895, + "grad_norm": 0.6882988810539246, + "learning_rate": 1e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7510389238595963, + "num_tokens": 251270537.0, + "step": 872 + }, + { + "epoch": 0.3109528049866429, + "grad_norm": 0.7197768092155457, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7482335865497589, + "num_tokens": 251557720.0, + "step": 873 + }, + { + "epoch": 0.31130899376669635, + "grad_norm": 0.6775107979774475, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7443195432424545, + "num_tokens": 251867152.0, + "step": 874 + }, + { + "epoch": 0.3116651825467498, + "grad_norm": 0.7455785870552063, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7403659969568253, + "num_tokens": 252148735.0, + "step": 875 + }, + { + "epoch": 0.3120213713268032, + "grad_norm": 0.6805530786514282, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7426274120807648, + "num_tokens": 252457949.0, + "step": 876 + }, + { + "epoch": 0.31237756010685663, + "grad_norm": 0.6924580335617065, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7668676525354385, + "num_tokens": 252766661.0, + "step": 877 + }, + { + "epoch": 0.3127337488869101, + "grad_norm": 0.6584977507591248, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7630959004163742, + "num_tokens": 253069791.0, + "step": 878 + }, + { + "epoch": 0.31308993766696347, + "grad_norm": 0.699175238609314, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7184184789657593, + 
"num_tokens": 253331941.0, + "step": 879 + }, + { + "epoch": 0.3134461264470169, + "grad_norm": 0.7753520607948303, + "learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.752400204539299, + "num_tokens": 253634426.0, + "step": 880 + }, + { + "epoch": 0.31380231522707036, + "grad_norm": 0.7499920725822449, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7607174664735794, + "num_tokens": 253906015.0, + "step": 881 + }, + { + "epoch": 0.31415850400712375, + "grad_norm": 0.7561964392662048, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7419963032007217, + "num_tokens": 254161324.0, + "step": 882 + }, + { + "epoch": 0.3145146927871772, + "grad_norm": 0.711694061756134, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7581741809844971, + "num_tokens": 254447409.0, + "step": 883 + }, + { + "epoch": 0.31487088156723064, + "grad_norm": 0.6882364153862, + "learning_rate": 1e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7612715810537338, + "num_tokens": 254781431.0, + "step": 884 + }, + { + "epoch": 0.31522707034728403, + "grad_norm": 0.675669252872467, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7403454482555389, + "num_tokens": 255095150.0, + "step": 885 + }, + { + "epoch": 0.3155832591273375, + "grad_norm": 0.6885696053504944, + "learning_rate": 1e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7463081926107407, + "num_tokens": 255390672.0, + "step": 886 + }, + { + "epoch": 0.3159394479073909, + "grad_norm": 0.7374991178512573, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7384773790836334, + "num_tokens": 255661976.0, + "step": 887 + }, + { + "epoch": 0.31629563668744437, + "grad_norm": 0.7295774221420288, + "learning_rate": 1e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7649404406547546, + "num_tokens": 255918972.0, + "step": 888 + }, + { + "epoch": 0.31665182546749776, + "grad_norm": 0.6996410489082336, + "learning_rate": 
1e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7449304908514023, + "num_tokens": 256214533.0, + "step": 889 + }, + { + "epoch": 0.3170080142475512, + "grad_norm": 0.7091266512870789, + "learning_rate": 1e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7576732933521271, + "num_tokens": 256480754.0, + "step": 890 + }, + { + "epoch": 0.31736420302760465, + "grad_norm": 0.7041316032409668, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7564043551683426, + "num_tokens": 256777042.0, + "step": 891 + }, + { + "epoch": 0.31772039180765804, + "grad_norm": 0.7239974141120911, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7493351250886917, + "num_tokens": 257061142.0, + "step": 892 + }, + { + "epoch": 0.3180765805877115, + "grad_norm": 0.697701632976532, + "learning_rate": 1e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.751897543668747, + "num_tokens": 257362130.0, + "step": 893 + }, + { + "epoch": 0.31843276936776493, + "grad_norm": 0.7013831734657288, + "learning_rate": 1e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7462702095508575, + "num_tokens": 257654224.0, + "step": 894 + }, + { + "epoch": 0.3187889581478183, + "grad_norm": 0.6960598230361938, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7528339326381683, + "num_tokens": 257952000.0, + "step": 895 + }, + { + "epoch": 0.31914514692787177, + "grad_norm": 0.7344940304756165, + "learning_rate": 1e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.7524381577968597, + "num_tokens": 258209536.0, + "step": 896 + }, + { + "epoch": 0.3195013357079252, + "grad_norm": 0.7291892170906067, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7404907792806625, + "num_tokens": 258504416.0, + "step": 897 + }, + { + "epoch": 0.3198575244879786, + "grad_norm": 0.6907812356948853, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7415270656347275, + "num_tokens": 258774965.0, + "step": 898 + }, + { + "epoch": 
0.32021371326803205, + "grad_norm": 0.7184278964996338, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7453598380088806, + "num_tokens": 259042691.0, + "step": 899 + }, + { + "epoch": 0.3205699020480855, + "grad_norm": 0.6464451551437378, + "learning_rate": 1e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7764433324337006, + "num_tokens": 259370862.0, + "step": 900 + }, + { + "epoch": 0.3209260908281389, + "grad_norm": 0.6892960071563721, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7533179670572281, + "num_tokens": 259644773.0, + "step": 901 + }, + { + "epoch": 0.32128227960819233, + "grad_norm": 0.7009636163711548, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7588440030813217, + "num_tokens": 259906620.0, + "step": 902 + }, + { + "epoch": 0.3216384683882458, + "grad_norm": 0.6810518503189087, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.747457891702652, + "num_tokens": 260208950.0, + "step": 903 + }, + { + "epoch": 0.3219946571682992, + "grad_norm": 0.6717895865440369, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7565653622150421, + "num_tokens": 260522622.0, + "step": 904 + }, + { + "epoch": 0.3223508459483526, + "grad_norm": 0.727199375629425, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7552250772714615, + "num_tokens": 260795313.0, + "step": 905 + }, + { + "epoch": 0.32270703472840606, + "grad_norm": 0.6559717059135437, + "learning_rate": 1e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.75685915350914, + "num_tokens": 261096959.0, + "step": 906 + }, + { + "epoch": 0.3230632235084595, + "grad_norm": 0.7432389259338379, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7486508935689926, + "num_tokens": 261391182.0, + "step": 907 + }, + { + "epoch": 0.3234194122885129, + "grad_norm": 0.674475371837616, + "learning_rate": 1e-06, + "loss": 0.7972, + "mean_token_accuracy": 
0.7559934705495834, + "num_tokens": 261690664.0, + "step": 908 + }, + { + "epoch": 0.32377560106856634, + "grad_norm": 0.6811354160308838, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7411022037267685, + "num_tokens": 262016516.0, + "step": 909 + }, + { + "epoch": 0.3241317898486198, + "grad_norm": 0.6726001501083374, + "learning_rate": 1e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.7492468506097794, + "num_tokens": 262337260.0, + "step": 910 + }, + { + "epoch": 0.3244879786286732, + "grad_norm": 0.7182547450065613, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7589094936847687, + "num_tokens": 262604298.0, + "step": 911 + }, + { + "epoch": 0.3248441674087266, + "grad_norm": 0.7016334533691406, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7465712279081345, + "num_tokens": 262895317.0, + "step": 912 + }, + { + "epoch": 0.32520035618878007, + "grad_norm": 0.6790099740028381, + "learning_rate": 1e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7804080247879028, + "num_tokens": 263186630.0, + "step": 913 + }, + { + "epoch": 0.32555654496883346, + "grad_norm": 0.6642248034477234, + "learning_rate": 1e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7540160417556763, + "num_tokens": 263511729.0, + "step": 914 + }, + { + "epoch": 0.3259127337488869, + "grad_norm": 0.7061387300491333, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7591021060943604, + "num_tokens": 263772896.0, + "step": 915 + }, + { + "epoch": 0.32626892252894035, + "grad_norm": 0.7388876080513, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7425030171871185, + "num_tokens": 264049985.0, + "step": 916 + }, + { + "epoch": 0.32662511130899374, + "grad_norm": 0.7094252705574036, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7644518613815308, + "num_tokens": 264321781.0, + "step": 917 + }, + { + "epoch": 0.3269813000890472, + "grad_norm": 
0.7235174775123596, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7601426690816879, + "num_tokens": 264614657.0, + "step": 918 + }, + { + "epoch": 0.32733748886910063, + "grad_norm": 0.6464847326278687, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7455369234085083, + "num_tokens": 264919509.0, + "step": 919 + }, + { + "epoch": 0.327693677649154, + "grad_norm": 0.6889439821243286, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7344417572021484, + "num_tokens": 265212368.0, + "step": 920 + }, + { + "epoch": 0.32804986642920747, + "grad_norm": 0.6966282725334167, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.739699512720108, + "num_tokens": 265524462.0, + "step": 921 + }, + { + "epoch": 0.3284060552092609, + "grad_norm": 0.6895467638969421, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7481822818517685, + "num_tokens": 265804657.0, + "step": 922 + }, + { + "epoch": 0.32876224398931436, + "grad_norm": 0.6910929083824158, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7681866139173508, + "num_tokens": 266084545.0, + "step": 923 + }, + { + "epoch": 0.32911843276936775, + "grad_norm": 0.635198712348938, + "learning_rate": 1e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7773841321468353, + "num_tokens": 266406673.0, + "step": 924 + }, + { + "epoch": 0.3294746215494212, + "grad_norm": 0.7144633531570435, + "learning_rate": 1e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7627619653940201, + "num_tokens": 266707724.0, + "step": 925 + }, + { + "epoch": 0.32983081032947464, + "grad_norm": 0.6677945852279663, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7402786910533905, + "num_tokens": 267013908.0, + "step": 926 + }, + { + "epoch": 0.33018699910952803, + "grad_norm": 0.7814738154411316, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7666203826665878, + "num_tokens": 
267261825.0, + "step": 927 + }, + { + "epoch": 0.3305431878895815, + "grad_norm": 0.6751267313957214, + "learning_rate": 1e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7659773975610733, + "num_tokens": 267545677.0, + "step": 928 + }, + { + "epoch": 0.3308993766696349, + "grad_norm": 0.676466166973114, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7615858763456345, + "num_tokens": 267846932.0, + "step": 929 + }, + { + "epoch": 0.3312555654496883, + "grad_norm": 0.7225381135940552, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7634077370166779, + "num_tokens": 268140074.0, + "step": 930 + }, + { + "epoch": 0.33161175422974176, + "grad_norm": 0.6660358905792236, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7738670408725739, + "num_tokens": 268417120.0, + "step": 931 + }, + { + "epoch": 0.3319679430097952, + "grad_norm": 0.6904882192611694, + "learning_rate": 1e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7576466500759125, + "num_tokens": 268715912.0, + "step": 932 + }, + { + "epoch": 0.3323241317898486, + "grad_norm": 0.6903769969940186, + "learning_rate": 1e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7616925835609436, + "num_tokens": 269008334.0, + "step": 933 + }, + { + "epoch": 0.33268032056990204, + "grad_norm": 0.6420634984970093, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7680698186159134, + "num_tokens": 269321299.0, + "step": 934 + }, + { + "epoch": 0.3330365093499555, + "grad_norm": 0.6877120733261108, + "learning_rate": 1e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7526570707559586, + "num_tokens": 269614785.0, + "step": 935 + }, + { + "epoch": 0.3333926981300089, + "grad_norm": 0.6552689075469971, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7292652726173401, + "num_tokens": 269930071.0, + "step": 936 + }, + { + "epoch": 0.3337488869100623, + "grad_norm": 0.6763952374458313, + "learning_rate": 1e-06, + 
"loss": 0.8124, + "mean_token_accuracy": 0.7524190843105316, + "num_tokens": 270224181.0, + "step": 937 + }, + { + "epoch": 0.33410507569011577, + "grad_norm": 1.137052059173584, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7562014609575272, + "num_tokens": 270501922.0, + "step": 938 + }, + { + "epoch": 0.3344612644701692, + "grad_norm": 0.6841080784797668, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7528908550739288, + "num_tokens": 270774730.0, + "step": 939 + }, + { + "epoch": 0.3348174532502226, + "grad_norm": 0.6947344541549683, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.742910236120224, + "num_tokens": 271064955.0, + "step": 940 + }, + { + "epoch": 0.33517364203027605, + "grad_norm": 0.7255611419677734, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7567681819200516, + "num_tokens": 271332172.0, + "step": 941 + }, + { + "epoch": 0.3355298308103295, + "grad_norm": 0.6490383148193359, + "learning_rate": 1e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7636667340993881, + "num_tokens": 271632811.0, + "step": 942 + }, + { + "epoch": 0.3358860195903829, + "grad_norm": 0.731758177280426, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7492655962705612, + "num_tokens": 271904566.0, + "step": 943 + }, + { + "epoch": 0.33624220837043634, + "grad_norm": 0.7096821069717407, + "learning_rate": 1e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.771147832274437, + "num_tokens": 272172161.0, + "step": 944 + }, + { + "epoch": 0.3365983971504898, + "grad_norm": 0.7136046886444092, + "learning_rate": 1e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7656608819961548, + "num_tokens": 272443616.0, + "step": 945 + }, + { + "epoch": 0.33695458593054317, + "grad_norm": 0.7256335616111755, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7521230131387711, + "num_tokens": 272711331.0, + "step": 946 + }, + { + "epoch": 
0.3373107747105966, + "grad_norm": 0.7299630641937256, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7522657066583633, + "num_tokens": 272973911.0, + "step": 947 + }, + { + "epoch": 0.33766696349065006, + "grad_norm": 0.6721721291542053, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7549170404672623, + "num_tokens": 273279105.0, + "step": 948 + }, + { + "epoch": 0.33802315227070345, + "grad_norm": 0.7277474999427795, + "learning_rate": 1e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7419050484895706, + "num_tokens": 273540735.0, + "step": 949 + }, + { + "epoch": 0.3383793410507569, + "grad_norm": 0.6745789647102356, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7749924212694168, + "num_tokens": 273827982.0, + "step": 950 + }, + { + "epoch": 0.33873552983081034, + "grad_norm": 0.6642516851425171, + "learning_rate": 1e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7550855278968811, + "num_tokens": 274133015.0, + "step": 951 + }, + { + "epoch": 0.33909171861086373, + "grad_norm": 0.7423582077026367, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.744803249835968, + "num_tokens": 274390487.0, + "step": 952 + }, + { + "epoch": 0.3394479073909172, + "grad_norm": 0.678476095199585, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7467830330133438, + "num_tokens": 274678733.0, + "step": 953 + }, + { + "epoch": 0.3398040961709706, + "grad_norm": 0.6573756337165833, + "learning_rate": 1e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7501372247934341, + "num_tokens": 275008253.0, + "step": 954 + }, + { + "epoch": 0.340160284951024, + "grad_norm": 0.7303286194801331, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7418164163827896, + "num_tokens": 275281796.0, + "step": 955 + }, + { + "epoch": 0.34051647373107746, + "grad_norm": 0.7096561789512634, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 
0.748378649353981, + "num_tokens": 275539803.0, + "step": 956 + }, + { + "epoch": 0.3408726625111309, + "grad_norm": 0.6861266493797302, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7638753205537796, + "num_tokens": 275817064.0, + "step": 957 + }, + { + "epoch": 0.34122885129118435, + "grad_norm": 0.6595292091369629, + "learning_rate": 1e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7674876153469086, + "num_tokens": 276124139.0, + "step": 958 + }, + { + "epoch": 0.34158504007123774, + "grad_norm": 0.6762322187423706, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7481222003698349, + "num_tokens": 276426047.0, + "step": 959 + }, + { + "epoch": 0.3419412288512912, + "grad_norm": 0.7026257514953613, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7398896515369415, + "num_tokens": 276706491.0, + "step": 960 + }, + { + "epoch": 0.34229741763134464, + "grad_norm": 0.6775445938110352, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7482015639543533, + "num_tokens": 277002062.0, + "step": 961 + }, + { + "epoch": 0.342653606411398, + "grad_norm": 0.7211707234382629, + "learning_rate": 1e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7519889026880264, + "num_tokens": 277307284.0, + "step": 962 + }, + { + "epoch": 0.34300979519145147, + "grad_norm": 0.707595705986023, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7361809313297272, + "num_tokens": 277614832.0, + "step": 963 + }, + { + "epoch": 0.3433659839715049, + "grad_norm": 0.6873271465301514, + "learning_rate": 1e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7555588632822037, + "num_tokens": 277888527.0, + "step": 964 + }, + { + "epoch": 0.3437221727515583, + "grad_norm": 0.7075751423835754, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.727765366435051, + "num_tokens": 278174572.0, + "step": 965 + }, + { + "epoch": 0.34407836153161175, + "grad_norm": 
0.6854116320610046, + "learning_rate": 1e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7546065002679825, + "num_tokens": 278476185.0, + "step": 966 + }, + { + "epoch": 0.3444345503116652, + "grad_norm": 0.6688453555107117, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7482503652572632, + "num_tokens": 278792834.0, + "step": 967 + }, + { + "epoch": 0.3447907390917186, + "grad_norm": 0.6962181329727173, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7355585247278214, + "num_tokens": 279094279.0, + "step": 968 + }, + { + "epoch": 0.34514692787177204, + "grad_norm": 0.6789394617080688, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7579223215579987, + "num_tokens": 279399576.0, + "step": 969 + }, + { + "epoch": 0.3455031166518255, + "grad_norm": 0.7233515977859497, + "learning_rate": 1e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7480160892009735, + "num_tokens": 279677592.0, + "step": 970 + }, + { + "epoch": 0.34585930543187887, + "grad_norm": 0.657217800617218, + "learning_rate": 1e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7643657177686691, + "num_tokens": 279979722.0, + "step": 971 + }, + { + "epoch": 0.3462154942119323, + "grad_norm": 0.7292103171348572, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7436488568782806, + "num_tokens": 280257856.0, + "step": 972 + }, + { + "epoch": 0.34657168299198576, + "grad_norm": 0.6996052861213684, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7663536667823792, + "num_tokens": 280573820.0, + "step": 973 + }, + { + "epoch": 0.34692787177203915, + "grad_norm": 0.7231012582778931, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7515165954828262, + "num_tokens": 280834024.0, + "step": 974 + }, + { + "epoch": 0.3472840605520926, + "grad_norm": 0.7002610564231873, + "learning_rate": 1e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7481623589992523, + "num_tokens": 281131757.0, + 
"step": 975 + }, + { + "epoch": 0.34764024933214605, + "grad_norm": 0.691470205783844, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7636671811342239, + "num_tokens": 281434411.0, + "step": 976 + }, + { + "epoch": 0.3479964381121995, + "grad_norm": 0.7002783417701721, + "learning_rate": 1e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7532222270965576, + "num_tokens": 281686533.0, + "step": 977 + }, + { + "epoch": 0.3483526268922529, + "grad_norm": 0.676359236240387, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7335332781076431, + "num_tokens": 281996626.0, + "step": 978 + }, + { + "epoch": 0.3487088156723063, + "grad_norm": 0.668714702129364, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7664030492305756, + "num_tokens": 282330011.0, + "step": 979 + }, + { + "epoch": 0.3490650044523598, + "grad_norm": 0.6790971755981445, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7695025503635406, + "num_tokens": 282600963.0, + "step": 980 + }, + { + "epoch": 0.34942119323241316, + "grad_norm": 0.683207094669342, + "learning_rate": 1e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7494850605726242, + "num_tokens": 282877822.0, + "step": 981 + }, + { + "epoch": 0.3497773820124666, + "grad_norm": 0.7416868209838867, + "learning_rate": 1e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7490220218896866, + "num_tokens": 283122864.0, + "step": 982 + }, + { + "epoch": 0.35013357079252005, + "grad_norm": 0.6529122591018677, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7685900330543518, + "num_tokens": 283419472.0, + "step": 983 + }, + { + "epoch": 0.35048975957257344, + "grad_norm": 0.6928706169128418, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7565998733043671, + "num_tokens": 283721228.0, + "step": 984 + }, + { + "epoch": 0.3508459483526269, + "grad_norm": 0.6847198009490967, + "learning_rate": 1e-06, + "loss": 0.755, + 
"mean_token_accuracy": 0.7613526284694672, + "num_tokens": 284021832.0, + "step": 985 + }, + { + "epoch": 0.35120213713268034, + "grad_norm": 0.7461682558059692, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7723182588815689, + "num_tokens": 284302403.0, + "step": 986 + }, + { + "epoch": 0.3515583259127337, + "grad_norm": 0.6975488066673279, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7456476837396622, + "num_tokens": 284589685.0, + "step": 987 + }, + { + "epoch": 0.3519145146927872, + "grad_norm": 0.7210144996643066, + "learning_rate": 1e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7549790143966675, + "num_tokens": 284851201.0, + "step": 988 + }, + { + "epoch": 0.3522707034728406, + "grad_norm": 0.7220459580421448, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7624691724777222, + "num_tokens": 285109928.0, + "step": 989 + }, + { + "epoch": 0.352626892252894, + "grad_norm": 0.6669437289237976, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7413257509469986, + "num_tokens": 285405371.0, + "step": 990 + }, + { + "epoch": 0.35298308103294745, + "grad_norm": 0.6808003783226013, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7596741914749146, + "num_tokens": 285692980.0, + "step": 991 + }, + { + "epoch": 0.3533392698130009, + "grad_norm": 0.6827493906021118, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7667705565690994, + "num_tokens": 286004040.0, + "step": 992 + }, + { + "epoch": 0.35369545859305435, + "grad_norm": 0.6854224801063538, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7523939460515976, + "num_tokens": 286293132.0, + "step": 993 + }, + { + "epoch": 0.35405164737310774, + "grad_norm": 0.751265823841095, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7615468204021454, + "num_tokens": 286562644.0, + "step": 994 + }, + { + "epoch": 0.3544078361531612, + 
"grad_norm": 0.6911643743515015, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7647616267204285, + "num_tokens": 286850979.0, + "step": 995 + }, + { + "epoch": 0.35476402493321463, + "grad_norm": 0.691410481929779, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7576535046100616, + "num_tokens": 287130896.0, + "step": 996 + }, + { + "epoch": 0.355120213713268, + "grad_norm": 0.6747599840164185, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7641595155000687, + "num_tokens": 287419265.0, + "step": 997 + }, + { + "epoch": 0.35547640249332146, + "grad_norm": 0.7338981032371521, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7412437945604324, + "num_tokens": 287685891.0, + "step": 998 + }, + { + "epoch": 0.3558325912733749, + "grad_norm": 0.7233175039291382, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7607902884483337, + "num_tokens": 287968242.0, + "step": 999 + }, + { + "epoch": 0.3561887800534283, + "grad_norm": 0.6967154741287231, + "learning_rate": 1e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7511941194534302, + "num_tokens": 288244556.0, + "step": 1000 + }, + { + "epoch": 0.35654496883348175, + "grad_norm": 0.6586833596229553, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7523767948150635, + "num_tokens": 288567034.0, + "step": 1001 + }, + { + "epoch": 0.3569011576135352, + "grad_norm": 0.7358320355415344, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7411203980445862, + "num_tokens": 288830766.0, + "step": 1002 + }, + { + "epoch": 0.3572573463935886, + "grad_norm": 0.6700771450996399, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7551371306180954, + "num_tokens": 289126751.0, + "step": 1003 + }, + { + "epoch": 0.357613535173642, + "grad_norm": 0.697033166885376, + "learning_rate": 1e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7576008290052414, + 
"num_tokens": 289415149.0, + "step": 1004 + }, + { + "epoch": 0.3579697239536955, + "grad_norm": 0.7068910598754883, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7517503947019577, + "num_tokens": 289678867.0, + "step": 1005 + }, + { + "epoch": 0.35832591273374886, + "grad_norm": 0.6499239206314087, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.76632459461689, + "num_tokens": 289989398.0, + "step": 1006 + }, + { + "epoch": 0.3586821015138023, + "grad_norm": 0.711359977722168, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7405799925327301, + "num_tokens": 290250250.0, + "step": 1007 + }, + { + "epoch": 0.35903829029385576, + "grad_norm": 0.7035178542137146, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7312429249286652, + "num_tokens": 290544847.0, + "step": 1008 + }, + { + "epoch": 0.35939447907390915, + "grad_norm": 0.726648211479187, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7583761811256409, + "num_tokens": 290811695.0, + "step": 1009 + }, + { + "epoch": 0.3597506678539626, + "grad_norm": 0.6816009283065796, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7462455779314041, + "num_tokens": 291133868.0, + "step": 1010 + }, + { + "epoch": 0.36010685663401604, + "grad_norm": 0.658146321773529, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7552063912153244, + "num_tokens": 291448716.0, + "step": 1011 + }, + { + "epoch": 0.3604630454140695, + "grad_norm": 0.6808111667633057, + "learning_rate": 1e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7637453824281693, + "num_tokens": 291751305.0, + "step": 1012 + }, + { + "epoch": 0.3608192341941229, + "grad_norm": 0.7308806777000427, + "learning_rate": 1e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7592238485813141, + "num_tokens": 292029694.0, + "step": 1013 + }, + { + "epoch": 0.3611754229741763, + "grad_norm": 0.708733856678009, + 
"learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7563372701406479, + "num_tokens": 292314356.0, + "step": 1014 + }, + { + "epoch": 0.36153161175422976, + "grad_norm": 0.7176215648651123, + "learning_rate": 1e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7683443874120712, + "num_tokens": 292585059.0, + "step": 1015 + }, + { + "epoch": 0.36188780053428315, + "grad_norm": 0.7011387944221497, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.771283283829689, + "num_tokens": 292861613.0, + "step": 1016 + }, + { + "epoch": 0.3622439893143366, + "grad_norm": 0.6628348231315613, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7602010667324066, + "num_tokens": 293158259.0, + "step": 1017 + }, + { + "epoch": 0.36260017809439005, + "grad_norm": 0.7048265337944031, + "learning_rate": 1e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7599413096904755, + "num_tokens": 293450109.0, + "step": 1018 + }, + { + "epoch": 0.36295636687444344, + "grad_norm": 0.6773717999458313, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7453852593898773, + "num_tokens": 293755636.0, + "step": 1019 + }, + { + "epoch": 0.3633125556544969, + "grad_norm": 0.6900895833969116, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7465929090976715, + "num_tokens": 294032933.0, + "step": 1020 + }, + { + "epoch": 0.36366874443455033, + "grad_norm": 0.6773861050605774, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7474559843540192, + "num_tokens": 294336728.0, + "step": 1021 + }, + { + "epoch": 0.3640249332146037, + "grad_norm": 0.741661012172699, + "learning_rate": 1e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7669714093208313, + "num_tokens": 294598221.0, + "step": 1022 + }, + { + "epoch": 0.36438112199465716, + "grad_norm": 0.7195872664451599, + "learning_rate": 1e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7592649012804031, + "num_tokens": 294876838.0, + "step": 
1023 + }, + { + "epoch": 0.3647373107747106, + "grad_norm": 0.6801986694335938, + "learning_rate": 1e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7552238404750824, + "num_tokens": 295151939.0, + "step": 1024 + }, + { + "epoch": 0.365093499554764, + "grad_norm": 0.6731950044631958, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7642020434141159, + "num_tokens": 295427447.0, + "step": 1025 + }, + { + "epoch": 0.36544968833481745, + "grad_norm": 0.7017679810523987, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7342010587453842, + "num_tokens": 295704480.0, + "step": 1026 + }, + { + "epoch": 0.3658058771148709, + "grad_norm": 0.7031132578849792, + "learning_rate": 1e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7555843442678452, + "num_tokens": 295984261.0, + "step": 1027 + }, + { + "epoch": 0.3661620658949243, + "grad_norm": 0.7085416316986084, + "learning_rate": 1e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7465939074754715, + "num_tokens": 296255130.0, + "step": 1028 + }, + { + "epoch": 0.36651825467497773, + "grad_norm": 0.660037100315094, + "learning_rate": 1e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7626747041940689, + "num_tokens": 296554148.0, + "step": 1029 + }, + { + "epoch": 0.3668744434550312, + "grad_norm": 0.6941710710525513, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7482791244983673, + "num_tokens": 296826902.0, + "step": 1030 + }, + { + "epoch": 0.3672306322350846, + "grad_norm": 0.6764687895774841, + "learning_rate": 1e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7668323367834091, + "num_tokens": 297107181.0, + "step": 1031 + }, + { + "epoch": 0.367586821015138, + "grad_norm": 0.6777805685997009, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7497245073318481, + "num_tokens": 297407415.0, + "step": 1032 + }, + { + "epoch": 0.36794300979519146, + "grad_norm": 0.6947880983352661, + "learning_rate": 1e-06, + "loss": 0.8312, + 
"mean_token_accuracy": 0.743273988366127, + "num_tokens": 297689297.0, + "step": 1033 + }, + { + "epoch": 0.3682991985752449, + "grad_norm": 0.6611723303794861, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7626070827245712, + "num_tokens": 298002232.0, + "step": 1034 + }, + { + "epoch": 0.3686553873552983, + "grad_norm": 0.6518058180809021, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7584635615348816, + "num_tokens": 298292996.0, + "step": 1035 + }, + { + "epoch": 0.36901157613535174, + "grad_norm": 0.6908048391342163, + "learning_rate": 1e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7600978463888168, + "num_tokens": 298561987.0, + "step": 1036 + }, + { + "epoch": 0.3693677649154052, + "grad_norm": 0.6798720955848694, + "learning_rate": 1e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7523448765277863, + "num_tokens": 298859726.0, + "step": 1037 + }, + { + "epoch": 0.3697239536954586, + "grad_norm": 0.6494482755661011, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7688305377960205, + "num_tokens": 299171068.0, + "step": 1038 + }, + { + "epoch": 0.370080142475512, + "grad_norm": 0.6691728234291077, + "learning_rate": 1e-06, + "loss": 0.774, + "mean_token_accuracy": 0.764058068394661, + "num_tokens": 299470809.0, + "step": 1039 + }, + { + "epoch": 0.37043633125556547, + "grad_norm": 0.6800209879875183, + "learning_rate": 1e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7454710602760315, + "num_tokens": 299777168.0, + "step": 1040 + }, + { + "epoch": 0.37079252003561886, + "grad_norm": 0.7093147039413452, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7545020133256912, + "num_tokens": 300079894.0, + "step": 1041 + }, + { + "epoch": 0.3711487088156723, + "grad_norm": 0.6487396955490112, + "learning_rate": 1e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.7700338959693909, + "num_tokens": 300398908.0, + "step": 1042 + }, + { + "epoch": 0.37150489759572575, 
+ "grad_norm": 0.781402051448822, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.74246746301651, + "num_tokens": 300683955.0, + "step": 1043 + }, + { + "epoch": 0.37186108637577914, + "grad_norm": 0.6786559224128723, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7416154742240906, + "num_tokens": 300995000.0, + "step": 1044 + }, + { + "epoch": 0.3722172751558326, + "grad_norm": 0.6823394894599915, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7795024216175079, + "num_tokens": 301309625.0, + "step": 1045 + }, + { + "epoch": 0.37257346393588603, + "grad_norm": 0.6958190202713013, + "learning_rate": 1e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7574737668037415, + "num_tokens": 301587937.0, + "step": 1046 + }, + { + "epoch": 0.3729296527159395, + "grad_norm": 0.6679887175559998, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7506357729434967, + "num_tokens": 301907959.0, + "step": 1047 + }, + { + "epoch": 0.37328584149599287, + "grad_norm": 0.7311645150184631, + "learning_rate": 1e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7521284967660904, + "num_tokens": 302168296.0, + "step": 1048 + }, + { + "epoch": 0.3736420302760463, + "grad_norm": 0.6927871704101562, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7581268846988678, + "num_tokens": 302445807.0, + "step": 1049 + }, + { + "epoch": 0.37399821905609976, + "grad_norm": 0.662546694278717, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7764098048210144, + "num_tokens": 302746400.0, + "step": 1050 + }, + { + "epoch": 0.37435440783615315, + "grad_norm": 0.719197154045105, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7490465044975281, + "num_tokens": 303033703.0, + "step": 1051 + }, + { + "epoch": 0.3747105966162066, + "grad_norm": 0.6941804885864258, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7621138691902161, + 
"num_tokens": 303338462.0, + "step": 1052 + }, + { + "epoch": 0.37506678539626004, + "grad_norm": 0.6579605340957642, + "learning_rate": 1e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7396310269832611, + "num_tokens": 303641237.0, + "step": 1053 + }, + { + "epoch": 0.37542297417631343, + "grad_norm": 0.7211250066757202, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7510149925947189, + "num_tokens": 303916162.0, + "step": 1054 + }, + { + "epoch": 0.3757791629563669, + "grad_norm": 0.6518275737762451, + "learning_rate": 1e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.75487320125103, + "num_tokens": 304227375.0, + "step": 1055 + }, + { + "epoch": 0.3761353517364203, + "grad_norm": 0.7089177966117859, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7442062646150589, + "num_tokens": 304517378.0, + "step": 1056 + }, + { + "epoch": 0.3764915405164737, + "grad_norm": 0.6763259172439575, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7478460371494293, + "num_tokens": 304815823.0, + "step": 1057 + }, + { + "epoch": 0.37684772929652716, + "grad_norm": 0.695899486541748, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7555164694786072, + "num_tokens": 305145037.0, + "step": 1058 + }, + { + "epoch": 0.3772039180765806, + "grad_norm": 0.6418567299842834, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7657504230737686, + "num_tokens": 305464707.0, + "step": 1059 + }, + { + "epoch": 0.377560106856634, + "grad_norm": 0.756751298904419, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7485355287790298, + "num_tokens": 305720974.0, + "step": 1060 + }, + { + "epoch": 0.37791629563668744, + "grad_norm": 0.7182579040527344, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7497600317001343, + "num_tokens": 305978436.0, + "step": 1061 + }, + { + "epoch": 0.3782724844167409, + "grad_norm": 0.6872028708457947, + 
"learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7559042572975159, + "num_tokens": 306271830.0, + "step": 1062 + }, + { + "epoch": 0.3786286731967943, + "grad_norm": 0.7423546314239502, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7576362788677216, + "num_tokens": 306540813.0, + "step": 1063 + }, + { + "epoch": 0.3789848619768477, + "grad_norm": 0.7116988897323608, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7549264430999756, + "num_tokens": 306806071.0, + "step": 1064 + }, + { + "epoch": 0.37934105075690117, + "grad_norm": 0.6734312176704407, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7581920921802521, + "num_tokens": 307079345.0, + "step": 1065 + }, + { + "epoch": 0.3796972395369546, + "grad_norm": 0.7518114447593689, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7609548568725586, + "num_tokens": 307380899.0, + "step": 1066 + }, + { + "epoch": 0.380053428317008, + "grad_norm": 0.7126616835594177, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7433136999607086, + "num_tokens": 307675195.0, + "step": 1067 + }, + { + "epoch": 0.38040961709706145, + "grad_norm": 0.6421465277671814, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7572032660245895, + "num_tokens": 307994217.0, + "step": 1068 + }, + { + "epoch": 0.3807658058771149, + "grad_norm": 0.6742041110992432, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7573107481002808, + "num_tokens": 308306086.0, + "step": 1069 + }, + { + "epoch": 0.3811219946571683, + "grad_norm": 0.7821462154388428, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7564298659563065, + "num_tokens": 308579396.0, + "step": 1070 + }, + { + "epoch": 0.38147818343722173, + "grad_norm": 0.6763910055160522, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7573541104793549, + "num_tokens": 308878606.0, + "step": 
1071 + }, + { + "epoch": 0.3818343722172752, + "grad_norm": 0.6641843914985657, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7498129159212112, + "num_tokens": 309181583.0, + "step": 1072 + }, + { + "epoch": 0.38219056099732857, + "grad_norm": 0.733188271522522, + "learning_rate": 1e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7447738498449326, + "num_tokens": 309446729.0, + "step": 1073 + }, + { + "epoch": 0.382546749777382, + "grad_norm": 0.747186005115509, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7389824390411377, + "num_tokens": 309735888.0, + "step": 1074 + }, + { + "epoch": 0.38290293855743546, + "grad_norm": 0.7028849720954895, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7590918093919754, + "num_tokens": 310042573.0, + "step": 1075 + }, + { + "epoch": 0.38325912733748885, + "grad_norm": 0.6774846315383911, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7648516744375229, + "num_tokens": 310322501.0, + "step": 1076 + }, + { + "epoch": 0.3836153161175423, + "grad_norm": 0.670171856880188, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7491095215082169, + "num_tokens": 310640251.0, + "step": 1077 + }, + { + "epoch": 0.38397150489759574, + "grad_norm": 0.746183454990387, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7382780164480209, + "num_tokens": 310920055.0, + "step": 1078 + }, + { + "epoch": 0.38432769367764913, + "grad_norm": 0.6765603423118591, + "learning_rate": 1e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7621345520019531, + "num_tokens": 311215366.0, + "step": 1079 + }, + { + "epoch": 0.3846838824577026, + "grad_norm": 0.6720067262649536, + "learning_rate": 1e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.750799372792244, + "num_tokens": 311506470.0, + "step": 1080 + }, + { + "epoch": 0.385040071237756, + "grad_norm": 0.7076883912086487, + "learning_rate": 1e-06, + "loss": 0.8778, + 
"mean_token_accuracy": 0.7324828058481216, + "num_tokens": 311815525.0, + "step": 1081 + }, + { + "epoch": 0.38539626001780947, + "grad_norm": 0.6588482856750488, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.756848618388176, + "num_tokens": 312105390.0, + "step": 1082 + }, + { + "epoch": 0.38575244879786286, + "grad_norm": 0.6439084410667419, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7494002729654312, + "num_tokens": 312418752.0, + "step": 1083 + }, + { + "epoch": 0.3861086375779163, + "grad_norm": 0.6857344508171082, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7470608055591583, + "num_tokens": 312700463.0, + "step": 1084 + }, + { + "epoch": 0.38646482635796975, + "grad_norm": 0.6594713926315308, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7614190578460693, + "num_tokens": 312998699.0, + "step": 1085 + }, + { + "epoch": 0.38682101513802314, + "grad_norm": 0.6753360033035278, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7464402914047241, + "num_tokens": 313272495.0, + "step": 1086 + }, + { + "epoch": 0.3871772039180766, + "grad_norm": 0.7289642691612244, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7525405585765839, + "num_tokens": 313553650.0, + "step": 1087 + }, + { + "epoch": 0.38753339269813003, + "grad_norm": 0.6977395415306091, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7432194948196411, + "num_tokens": 313828654.0, + "step": 1088 + }, + { + "epoch": 0.3878895814781834, + "grad_norm": 0.6590979695320129, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7611910998821259, + "num_tokens": 314122131.0, + "step": 1089 + }, + { + "epoch": 0.38824577025823687, + "grad_norm": 0.7077186703681946, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7265421003103256, + "num_tokens": 314431004.0, + "step": 1090 + }, + { + "epoch": 
0.3886019590382903, + "grad_norm": 0.695683479309082, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7462074011564255, + "num_tokens": 314724199.0, + "step": 1091 + }, + { + "epoch": 0.3889581478183437, + "grad_norm": 0.6803678870201111, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7486949563026428, + "num_tokens": 315009898.0, + "step": 1092 + }, + { + "epoch": 0.38931433659839715, + "grad_norm": 0.6822684407234192, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7304403334856033, + "num_tokens": 315308889.0, + "step": 1093 + }, + { + "epoch": 0.3896705253784506, + "grad_norm": 0.6963436603546143, + "learning_rate": 1e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7567585855722427, + "num_tokens": 315589127.0, + "step": 1094 + }, + { + "epoch": 0.390026714158504, + "grad_norm": 0.7303425073623657, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7425284832715988, + "num_tokens": 315862789.0, + "step": 1095 + }, + { + "epoch": 0.39038290293855743, + "grad_norm": 0.6653121113777161, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7389765530824661, + "num_tokens": 316157558.0, + "step": 1096 + }, + { + "epoch": 0.3907390917186109, + "grad_norm": 0.7077389359474182, + "learning_rate": 1e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.7553563565015793, + "num_tokens": 316431485.0, + "step": 1097 + }, + { + "epoch": 0.39109528049866427, + "grad_norm": 0.778586208820343, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7449379563331604, + "num_tokens": 316728470.0, + "step": 1098 + }, + { + "epoch": 0.3914514692787177, + "grad_norm": 0.7309492826461792, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7400401681661606, + "num_tokens": 316974129.0, + "step": 1099 + }, + { + "epoch": 0.39180765805877116, + "grad_norm": 0.7038516998291016, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 
0.7544625401496887, + "num_tokens": 317272363.0, + "step": 1100 + }, + { + "epoch": 0.3921638468388246, + "grad_norm": 0.659795343875885, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.772985428571701, + "num_tokens": 317573777.0, + "step": 1101 + }, + { + "epoch": 0.392520035618878, + "grad_norm": 0.6639795899391174, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7671139240264893, + "num_tokens": 317920556.0, + "step": 1102 + }, + { + "epoch": 0.39287622439893144, + "grad_norm": 0.6788672208786011, + "learning_rate": 1e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7521893531084061, + "num_tokens": 318185117.0, + "step": 1103 + }, + { + "epoch": 0.3932324131789849, + "grad_norm": 0.7093088626861572, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7436272948980331, + "num_tokens": 318471788.0, + "step": 1104 + }, + { + "epoch": 0.3935886019590383, + "grad_norm": 0.6908660531044006, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.760959267616272, + "num_tokens": 318771823.0, + "step": 1105 + }, + { + "epoch": 0.3939447907390917, + "grad_norm": 0.6808560490608215, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7573212087154388, + "num_tokens": 319065150.0, + "step": 1106 + }, + { + "epoch": 0.39430097951914517, + "grad_norm": 0.6487220525741577, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7697229236364365, + "num_tokens": 319372696.0, + "step": 1107 + }, + { + "epoch": 0.39465716829919856, + "grad_norm": 0.7044273614883423, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7556710094213486, + "num_tokens": 319661899.0, + "step": 1108 + }, + { + "epoch": 0.395013357079252, + "grad_norm": 0.7349939346313477, + "learning_rate": 1e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7543495446443558, + "num_tokens": 319931702.0, + "step": 1109 + }, + { + "epoch": 0.39536954585930545, + "grad_norm": 
0.7420426607131958, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7388109117746353, + "num_tokens": 320206144.0, + "step": 1110 + }, + { + "epoch": 0.39572573463935884, + "grad_norm": 0.6669763326644897, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7652877569198608, + "num_tokens": 320514050.0, + "step": 1111 + }, + { + "epoch": 0.3960819234194123, + "grad_norm": 0.7177374958992004, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7476798743009567, + "num_tokens": 320789291.0, + "step": 1112 + }, + { + "epoch": 0.39643811219946573, + "grad_norm": 0.743962824344635, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7558945715427399, + "num_tokens": 321068567.0, + "step": 1113 + }, + { + "epoch": 0.3967943009795191, + "grad_norm": 0.7025802135467529, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.743312805891037, + "num_tokens": 321361784.0, + "step": 1114 + }, + { + "epoch": 0.39715048975957257, + "grad_norm": 0.6849380135536194, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7569027096033096, + "num_tokens": 321645038.0, + "step": 1115 + }, + { + "epoch": 0.397506678539626, + "grad_norm": 0.6932703256607056, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7596959620714188, + "num_tokens": 321938491.0, + "step": 1116 + }, + { + "epoch": 0.3978628673196794, + "grad_norm": 0.7199350595474243, + "learning_rate": 1e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7552053332328796, + "num_tokens": 322198662.0, + "step": 1117 + }, + { + "epoch": 0.39821905609973285, + "grad_norm": 0.7038455605506897, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7487005293369293, + "num_tokens": 322485380.0, + "step": 1118 + }, + { + "epoch": 0.3985752448797863, + "grad_norm": 0.7249865531921387, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7596726715564728, + "num_tokens": 
322780052.0, + "step": 1119 + }, + { + "epoch": 0.39893143365983974, + "grad_norm": 0.7039104104042053, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7519682347774506, + "num_tokens": 323063890.0, + "step": 1120 + }, + { + "epoch": 0.39928762243989313, + "grad_norm": 0.6295273303985596, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7594645172357559, + "num_tokens": 323377901.0, + "step": 1121 + }, + { + "epoch": 0.3996438112199466, + "grad_norm": 0.6390991806983948, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7534127831459045, + "num_tokens": 323680365.0, + "step": 1122 + }, + { + "epoch": 0.4, + "grad_norm": 0.6377549171447754, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7569628208875656, + "num_tokens": 323982408.0, + "step": 1123 + }, + { + "epoch": 0.4003561887800534, + "grad_norm": 0.6919654011726379, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7397087514400482, + "num_tokens": 324256897.0, + "step": 1124 + }, + { + "epoch": 0.40071237756010686, + "grad_norm": 0.7017194628715515, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7425800412893295, + "num_tokens": 324539695.0, + "step": 1125 + }, + { + "epoch": 0.4010685663401603, + "grad_norm": 0.6524563431739807, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7639384269714355, + "num_tokens": 324848581.0, + "step": 1126 + }, + { + "epoch": 0.4014247551202137, + "grad_norm": 0.681194543838501, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7571572363376617, + "num_tokens": 325145218.0, + "step": 1127 + }, + { + "epoch": 0.40178094390026714, + "grad_norm": 0.6765927672386169, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7450330853462219, + "num_tokens": 325432935.0, + "step": 1128 + }, + { + "epoch": 0.4021371326803206, + "grad_norm": 0.7047073841094971, + "learning_rate": 1e-06, + "loss": 
0.7793, + "mean_token_accuracy": 0.757760152220726, + "num_tokens": 325724417.0, + "step": 1129 + }, + { + "epoch": 0.402493321460374, + "grad_norm": 0.7388649582862854, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7462565153837204, + "num_tokens": 325969758.0, + "step": 1130 + }, + { + "epoch": 0.4028495102404274, + "grad_norm": 0.7005550861358643, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7422333359718323, + "num_tokens": 326227391.0, + "step": 1131 + }, + { + "epoch": 0.40320569902048087, + "grad_norm": 0.7038488388061523, + "learning_rate": 1e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7716998904943466, + "num_tokens": 326506320.0, + "step": 1132 + }, + { + "epoch": 0.40356188780053426, + "grad_norm": 0.6818111538887024, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7410069704055786, + "num_tokens": 326800017.0, + "step": 1133 + }, + { + "epoch": 0.4039180765805877, + "grad_norm": 0.6641862392425537, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7457933723926544, + "num_tokens": 327098584.0, + "step": 1134 + }, + { + "epoch": 0.40427426536064115, + "grad_norm": 0.6889894008636475, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7569723874330521, + "num_tokens": 327401418.0, + "step": 1135 + }, + { + "epoch": 0.4046304541406946, + "grad_norm": 0.6896346807479858, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7687571495771408, + "num_tokens": 327685815.0, + "step": 1136 + }, + { + "epoch": 0.404986642920748, + "grad_norm": 0.6916198134422302, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7493809759616852, + "num_tokens": 327988894.0, + "step": 1137 + }, + { + "epoch": 0.40534283170080143, + "grad_norm": 0.7097260355949402, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7670486867427826, + "num_tokens": 328292698.0, + "step": 1138 + }, + { + "epoch": 
0.4056990204808549, + "grad_norm": 0.6869788765907288, + "learning_rate": 1e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7792748063802719, + "num_tokens": 328583569.0, + "step": 1139 + }, + { + "epoch": 0.40605520926090827, + "grad_norm": 0.724714457988739, + "learning_rate": 1e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7538985908031464, + "num_tokens": 328874424.0, + "step": 1140 + }, + { + "epoch": 0.4064113980409617, + "grad_norm": 0.669495165348053, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7496008723974228, + "num_tokens": 329177931.0, + "step": 1141 + }, + { + "epoch": 0.40676758682101516, + "grad_norm": 0.7514724135398865, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7622170746326447, + "num_tokens": 329446413.0, + "step": 1142 + }, + { + "epoch": 0.40712377560106855, + "grad_norm": 0.7841257452964783, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7615542560815811, + "num_tokens": 329699471.0, + "step": 1143 + }, + { + "epoch": 0.407479964381122, + "grad_norm": 0.6681774258613586, + "learning_rate": 1e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7639775276184082, + "num_tokens": 329989150.0, + "step": 1144 + }, + { + "epoch": 0.40783615316117544, + "grad_norm": 0.6875301003456116, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.747034564614296, + "num_tokens": 330290881.0, + "step": 1145 + }, + { + "epoch": 0.40819234194122883, + "grad_norm": 0.7369912266731262, + "learning_rate": 1e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7520595788955688, + "num_tokens": 330577426.0, + "step": 1146 + }, + { + "epoch": 0.4085485307212823, + "grad_norm": 0.7353833913803101, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7402537912130356, + "num_tokens": 330844593.0, + "step": 1147 + }, + { + "epoch": 0.4089047195013357, + "grad_norm": 0.7225067615509033, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 
0.7622229158878326, + "num_tokens": 331107374.0, + "step": 1148 + }, + { + "epoch": 0.4092609082813891, + "grad_norm": 0.7431294918060303, + "learning_rate": 1e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7625291347503662, + "num_tokens": 331364731.0, + "step": 1149 + }, + { + "epoch": 0.40961709706144256, + "grad_norm": 0.7475316524505615, + "learning_rate": 1e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7508395463228226, + "num_tokens": 331640686.0, + "step": 1150 + }, + { + "epoch": 0.409973285841496, + "grad_norm": 0.7598409056663513, + "learning_rate": 1e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7542297393083572, + "num_tokens": 331905177.0, + "step": 1151 + }, + { + "epoch": 0.4103294746215494, + "grad_norm": 0.6896981000900269, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7718125730752945, + "num_tokens": 332199417.0, + "step": 1152 + }, + { + "epoch": 0.41068566340160284, + "grad_norm": 0.7023691534996033, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7425171285867691, + "num_tokens": 332495646.0, + "step": 1153 + }, + { + "epoch": 0.4110418521816563, + "grad_norm": 0.7812708020210266, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7254598289728165, + "num_tokens": 332749290.0, + "step": 1154 + }, + { + "epoch": 0.41139804096170973, + "grad_norm": 0.8021993041038513, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7376922219991684, + "num_tokens": 332981033.0, + "step": 1155 + }, + { + "epoch": 0.4117542297417631, + "grad_norm": 0.7471128702163696, + "learning_rate": 1e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.7711766362190247, + "num_tokens": 333234419.0, + "step": 1156 + }, + { + "epoch": 0.41211041852181657, + "grad_norm": 0.7000200152397156, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7633799314498901, + "num_tokens": 333500870.0, + "step": 1157 + }, + { + "epoch": 0.41246660730187, + "grad_norm": 
0.7142425775527954, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7458591610193253, + "num_tokens": 333796734.0, + "step": 1158 + }, + { + "epoch": 0.4128227960819234, + "grad_norm": 0.7170581221580505, + "learning_rate": 1e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7587528079748154, + "num_tokens": 334087540.0, + "step": 1159 + }, + { + "epoch": 0.41317898486197685, + "grad_norm": 0.682434618473053, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7531097382307053, + "num_tokens": 334393659.0, + "step": 1160 + }, + { + "epoch": 0.4135351736420303, + "grad_norm": 0.72043377161026, + "learning_rate": 1e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7471151202917099, + "num_tokens": 334648574.0, + "step": 1161 + }, + { + "epoch": 0.4138913624220837, + "grad_norm": 0.680309534072876, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7640863209962845, + "num_tokens": 334931244.0, + "step": 1162 + }, + { + "epoch": 0.41424755120213713, + "grad_norm": 0.712909460067749, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7521609514951706, + "num_tokens": 335218138.0, + "step": 1163 + }, + { + "epoch": 0.4146037399821906, + "grad_norm": 0.6973684430122375, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7529626190662384, + "num_tokens": 335508967.0, + "step": 1164 + }, + { + "epoch": 0.41495992876224397, + "grad_norm": 0.7049043774604797, + "learning_rate": 1e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7484369874000549, + "num_tokens": 335793530.0, + "step": 1165 + }, + { + "epoch": 0.4153161175422974, + "grad_norm": 0.7368618249893188, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7541572004556656, + "num_tokens": 336082571.0, + "step": 1166 + }, + { + "epoch": 0.41567230632235086, + "grad_norm": 0.7069539427757263, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7520934641361237, + "num_tokens": 
336402845.0, + "step": 1167 + }, + { + "epoch": 0.41602849510240425, + "grad_norm": 0.7084040641784668, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7527274191379547, + "num_tokens": 336705436.0, + "step": 1168 + }, + { + "epoch": 0.4163846838824577, + "grad_norm": 0.6908108592033386, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.750663161277771, + "num_tokens": 336973179.0, + "step": 1169 + }, + { + "epoch": 0.41674087266251114, + "grad_norm": 0.7168322801589966, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.747025191783905, + "num_tokens": 337257991.0, + "step": 1170 + }, + { + "epoch": 0.41709706144256453, + "grad_norm": 0.7087938189506531, + "learning_rate": 1e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7641849368810654, + "num_tokens": 337559875.0, + "step": 1171 + }, + { + "epoch": 0.417453250222618, + "grad_norm": 0.7209700345993042, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7310457825660706, + "num_tokens": 337830769.0, + "step": 1172 + }, + { + "epoch": 0.4178094390026714, + "grad_norm": 0.6661319732666016, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7535110712051392, + "num_tokens": 338136372.0, + "step": 1173 + }, + { + "epoch": 0.41816562778272487, + "grad_norm": 0.7001761198043823, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7296863496303558, + "num_tokens": 338442381.0, + "step": 1174 + }, + { + "epoch": 0.41852181656277826, + "grad_norm": 0.736362874507904, + "learning_rate": 1e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7548524290323257, + "num_tokens": 338713776.0, + "step": 1175 + }, + { + "epoch": 0.4188780053428317, + "grad_norm": 0.662046492099762, + "learning_rate": 1e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7697170823812485, + "num_tokens": 339028331.0, + "step": 1176 + }, + { + "epoch": 0.41923419412288515, + "grad_norm": 0.6982311606407166, + "learning_rate": 
1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7554444968700409, + "num_tokens": 339312712.0, + "step": 1177 + }, + { + "epoch": 0.41959038290293854, + "grad_norm": 0.722695529460907, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7347546964883804, + "num_tokens": 339584972.0, + "step": 1178 + }, + { + "epoch": 0.419946571682992, + "grad_norm": 0.7076847553253174, + "learning_rate": 1e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7733012288808823, + "num_tokens": 339855767.0, + "step": 1179 + }, + { + "epoch": 0.42030276046304543, + "grad_norm": 0.7101437449455261, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7581965625286102, + "num_tokens": 340147126.0, + "step": 1180 + }, + { + "epoch": 0.4206589492430988, + "grad_norm": 0.6744058728218079, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7592988610267639, + "num_tokens": 340456544.0, + "step": 1181 + }, + { + "epoch": 0.42101513802315227, + "grad_norm": 0.6970324516296387, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7672973126173019, + "num_tokens": 340756029.0, + "step": 1182 + }, + { + "epoch": 0.4213713268032057, + "grad_norm": 0.7675511240959167, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7531236708164215, + "num_tokens": 341023369.0, + "step": 1183 + }, + { + "epoch": 0.4217275155832591, + "grad_norm": 0.6482188105583191, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7584234029054642, + "num_tokens": 341330831.0, + "step": 1184 + }, + { + "epoch": 0.42208370436331255, + "grad_norm": 0.6677564382553101, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7478495985269547, + "num_tokens": 341656676.0, + "step": 1185 + }, + { + "epoch": 0.422439893143366, + "grad_norm": 0.7357364892959595, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7629668712615967, + "num_tokens": 341926479.0, + "step": 1186 + }, + { + 
"epoch": 0.4227960819234194, + "grad_norm": 0.6668767333030701, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7623128592967987, + "num_tokens": 342222448.0, + "step": 1187 + }, + { + "epoch": 0.42315227070347283, + "grad_norm": 0.7326024174690247, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7780580222606659, + "num_tokens": 342489078.0, + "step": 1188 + }, + { + "epoch": 0.4235084594835263, + "grad_norm": 0.6593159437179565, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7434390485286713, + "num_tokens": 342794668.0, + "step": 1189 + }, + { + "epoch": 0.4238646482635797, + "grad_norm": 0.7267928123474121, + "learning_rate": 1e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7502516657114029, + "num_tokens": 343060073.0, + "step": 1190 + }, + { + "epoch": 0.4242208370436331, + "grad_norm": 0.7046674489974976, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7480416297912598, + "num_tokens": 343335652.0, + "step": 1191 + }, + { + "epoch": 0.42457702582368656, + "grad_norm": 0.6518410444259644, + "learning_rate": 1e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7426266968250275, + "num_tokens": 343647160.0, + "step": 1192 + }, + { + "epoch": 0.42493321460374, + "grad_norm": 0.7426651120185852, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7609278112649918, + "num_tokens": 343904138.0, + "step": 1193 + }, + { + "epoch": 0.4252894033837934, + "grad_norm": 0.6652823686599731, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7506037056446075, + "num_tokens": 344206893.0, + "step": 1194 + }, + { + "epoch": 0.42564559216384684, + "grad_norm": 0.6682024002075195, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7558945119380951, + "num_tokens": 344516448.0, + "step": 1195 + }, + { + "epoch": 0.4260017809439003, + "grad_norm": 0.7125647068023682, + "learning_rate": 1e-06, + "loss": 0.7713, + 
"mean_token_accuracy": 0.7539498507976532, + "num_tokens": 344793591.0, + "step": 1196 + }, + { + "epoch": 0.4263579697239537, + "grad_norm": 0.702404260635376, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7616499066352844, + "num_tokens": 345080006.0, + "step": 1197 + }, + { + "epoch": 0.4267141585040071, + "grad_norm": 0.7178996801376343, + "learning_rate": 1e-06, + "loss": 0.794, + "mean_token_accuracy": 0.7519125193357468, + "num_tokens": 345335921.0, + "step": 1198 + }, + { + "epoch": 0.42707034728406057, + "grad_norm": 0.6898183822631836, + "learning_rate": 1e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7496139854192734, + "num_tokens": 345621757.0, + "step": 1199 + }, + { + "epoch": 0.42742653606411396, + "grad_norm": 0.6830015778541565, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7564818263053894, + "num_tokens": 345909921.0, + "step": 1200 + }, + { + "epoch": 0.4277827248441674, + "grad_norm": 0.6899113655090332, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7543525397777557, + "num_tokens": 346196624.0, + "step": 1201 + }, + { + "epoch": 0.42813891362422085, + "grad_norm": 0.69756019115448, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7584922462701797, + "num_tokens": 346489345.0, + "step": 1202 + }, + { + "epoch": 0.42849510240427424, + "grad_norm": 0.6988915205001831, + "learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7504533976316452, + "num_tokens": 346774936.0, + "step": 1203 + }, + { + "epoch": 0.4288512911843277, + "grad_norm": 0.7013523578643799, + "learning_rate": 1e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.749071478843689, + "num_tokens": 347060221.0, + "step": 1204 + }, + { + "epoch": 0.42920747996438113, + "grad_norm": 0.7299675941467285, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7420122772455215, + "num_tokens": 347328270.0, + "step": 1205 + }, + { + "epoch": 0.4295636687444345, + 
"grad_norm": 0.6472702026367188, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.771324872970581, + "num_tokens": 347644559.0, + "step": 1206 + }, + { + "epoch": 0.42991985752448797, + "grad_norm": 0.629683256149292, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7711202949285507, + "num_tokens": 347965786.0, + "step": 1207 + }, + { + "epoch": 0.4302760463045414, + "grad_norm": 0.7087910175323486, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7507094591856003, + "num_tokens": 348242678.0, + "step": 1208 + }, + { + "epoch": 0.43063223508459486, + "grad_norm": 0.7083801031112671, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7445443421602249, + "num_tokens": 348513274.0, + "step": 1209 + }, + { + "epoch": 0.43098842386464825, + "grad_norm": 0.6619917154312134, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.748315304517746, + "num_tokens": 348807097.0, + "step": 1210 + }, + { + "epoch": 0.4313446126447017, + "grad_norm": 0.7579136490821838, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7646926641464233, + "num_tokens": 349075762.0, + "step": 1211 + }, + { + "epoch": 0.43170080142475514, + "grad_norm": 0.682807207107544, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7544610500335693, + "num_tokens": 349354308.0, + "step": 1212 + }, + { + "epoch": 0.43205699020480853, + "grad_norm": 0.7285266518592834, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7586190104484558, + "num_tokens": 349616255.0, + "step": 1213 + }, + { + "epoch": 0.432413178984862, + "grad_norm": 0.7173811197280884, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7605246901512146, + "num_tokens": 349900093.0, + "step": 1214 + }, + { + "epoch": 0.4327693677649154, + "grad_norm": 0.7154436111450195, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7462532818317413, + 
"num_tokens": 350187664.0, + "step": 1215 + }, + { + "epoch": 0.4331255565449688, + "grad_norm": 0.7493379712104797, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7626645565032959, + "num_tokens": 350441598.0, + "step": 1216 + }, + { + "epoch": 0.43348174532502226, + "grad_norm": 0.6677243113517761, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7734937816858292, + "num_tokens": 350731163.0, + "step": 1217 + }, + { + "epoch": 0.4338379341050757, + "grad_norm": 0.6896463632583618, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7677333503961563, + "num_tokens": 351022785.0, + "step": 1218 + }, + { + "epoch": 0.4341941228851291, + "grad_norm": 0.7760282158851624, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7347936481237411, + "num_tokens": 351287586.0, + "step": 1219 + }, + { + "epoch": 0.43455031166518254, + "grad_norm": 0.7529537677764893, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7299040257930756, + "num_tokens": 351527524.0, + "step": 1220 + }, + { + "epoch": 0.434906500445236, + "grad_norm": 0.716805636882782, + "learning_rate": 1e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.760548010468483, + "num_tokens": 351791659.0, + "step": 1221 + }, + { + "epoch": 0.4352626892252894, + "grad_norm": 0.6790550351142883, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7554935365915298, + "num_tokens": 352077403.0, + "step": 1222 + }, + { + "epoch": 0.4356188780053428, + "grad_norm": 0.6690599322319031, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7657830119132996, + "num_tokens": 352385027.0, + "step": 1223 + }, + { + "epoch": 0.43597506678539627, + "grad_norm": 0.6717358827590942, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7623239010572433, + "num_tokens": 352654885.0, + "step": 1224 + }, + { + "epoch": 0.4363312555654497, + "grad_norm": 0.6694024801254272, + 
"learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.746197059750557, + "num_tokens": 352950141.0, + "step": 1225 + }, + { + "epoch": 0.4366874443455031, + "grad_norm": 0.6830923557281494, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7671965956687927, + "num_tokens": 353239821.0, + "step": 1226 + }, + { + "epoch": 0.43704363312555655, + "grad_norm": 0.6571145057678223, + "learning_rate": 1e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.746619924902916, + "num_tokens": 353536639.0, + "step": 1227 + }, + { + "epoch": 0.43739982190561, + "grad_norm": 0.6624608635902405, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7451523989439011, + "num_tokens": 353860298.0, + "step": 1228 + }, + { + "epoch": 0.4377560106856634, + "grad_norm": 0.6801509857177734, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7545322179794312, + "num_tokens": 354148295.0, + "step": 1229 + }, + { + "epoch": 0.43811219946571683, + "grad_norm": 0.6884328722953796, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7534715682268143, + "num_tokens": 354430793.0, + "step": 1230 + }, + { + "epoch": 0.4384683882457703, + "grad_norm": 0.6286947131156921, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7754510641098022, + "num_tokens": 354744054.0, + "step": 1231 + }, + { + "epoch": 0.43882457702582367, + "grad_norm": 0.6788830161094666, + "learning_rate": 1e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7827643752098083, + "num_tokens": 355015368.0, + "step": 1232 + }, + { + "epoch": 0.4391807658058771, + "grad_norm": 0.7252916097640991, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7619216740131378, + "num_tokens": 355282354.0, + "step": 1233 + }, + { + "epoch": 0.43953695458593056, + "grad_norm": 0.7421103715896606, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.732938677072525, + "num_tokens": 355565981.0, + "step": 
1234 + }, + { + "epoch": 0.43989314336598395, + "grad_norm": 0.6856145262718201, + "learning_rate": 1e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7587875127792358, + "num_tokens": 355843597.0, + "step": 1235 + }, + { + "epoch": 0.4402493321460374, + "grad_norm": 0.6725185513496399, + "learning_rate": 1e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.7602906823158264, + "num_tokens": 356137715.0, + "step": 1236 + }, + { + "epoch": 0.44060552092609084, + "grad_norm": 0.7551268935203552, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7686867862939835, + "num_tokens": 356411545.0, + "step": 1237 + }, + { + "epoch": 0.44096170970614423, + "grad_norm": 0.710490345954895, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7707168012857437, + "num_tokens": 356671196.0, + "step": 1238 + }, + { + "epoch": 0.4413178984861977, + "grad_norm": 0.7837697267532349, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7565706670284271, + "num_tokens": 356955725.0, + "step": 1239 + }, + { + "epoch": 0.4416740872662511, + "grad_norm": 0.6603904962539673, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7675623148679733, + "num_tokens": 357267491.0, + "step": 1240 + }, + { + "epoch": 0.4420302760463045, + "grad_norm": 0.6214243173599243, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.771060049533844, + "num_tokens": 357594427.0, + "step": 1241 + }, + { + "epoch": 0.44238646482635796, + "grad_norm": 0.729295551776886, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7451503574848175, + "num_tokens": 357873445.0, + "step": 1242 + }, + { + "epoch": 0.4427426536064114, + "grad_norm": 0.6905696392059326, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7574450522661209, + "num_tokens": 358169913.0, + "step": 1243 + }, + { + "epoch": 0.44309884238646485, + "grad_norm": 0.6532860994338989, + "learning_rate": 1e-06, + "loss": 0.8241, + 
"mean_token_accuracy": 0.7484276741743088, + "num_tokens": 358480240.0, + "step": 1244 + }, + { + "epoch": 0.44345503116651824, + "grad_norm": 0.6935100555419922, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7278909385204315, + "num_tokens": 358762525.0, + "step": 1245 + }, + { + "epoch": 0.4438112199465717, + "grad_norm": 0.7006093859672546, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7553815692663193, + "num_tokens": 359068237.0, + "step": 1246 + }, + { + "epoch": 0.44416740872662513, + "grad_norm": 0.6830838322639465, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7446834892034531, + "num_tokens": 359377932.0, + "step": 1247 + }, + { + "epoch": 0.4445235975066785, + "grad_norm": 0.7121174335479736, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7592355161905289, + "num_tokens": 359656260.0, + "step": 1248 + }, + { + "epoch": 0.44487978628673197, + "grad_norm": 0.7017717957496643, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7620732188224792, + "num_tokens": 359943351.0, + "step": 1249 + }, + { + "epoch": 0.4452359750667854, + "grad_norm": 0.7461994886398315, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.751348540186882, + "num_tokens": 360218776.0, + "step": 1250 + }, + { + "epoch": 0.4455921638468388, + "grad_norm": 0.7186580896377563, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7472657114267349, + "num_tokens": 360496325.0, + "step": 1251 + }, + { + "epoch": 0.44594835262689225, + "grad_norm": 0.7097272276878357, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7366271018981934, + "num_tokens": 360760337.0, + "step": 1252 + }, + { + "epoch": 0.4463045414069457, + "grad_norm": 0.7222106456756592, + "learning_rate": 1e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7650704681873322, + "num_tokens": 361014311.0, + "step": 1253 + }, + { + "epoch": 
0.4466607301869991, + "grad_norm": 0.6920763254165649, + "learning_rate": 1e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7567969113588333, + "num_tokens": 361300041.0, + "step": 1254 + }, + { + "epoch": 0.44701691896705253, + "grad_norm": 0.6744621396064758, + "learning_rate": 1e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7635846138000488, + "num_tokens": 361593758.0, + "step": 1255 + }, + { + "epoch": 0.447373107747106, + "grad_norm": 0.7117516994476318, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7491289526224136, + "num_tokens": 361902821.0, + "step": 1256 + }, + { + "epoch": 0.44772929652715937, + "grad_norm": 0.6392377614974976, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7739770412445068, + "num_tokens": 362204965.0, + "step": 1257 + }, + { + "epoch": 0.4480854853072128, + "grad_norm": 0.6503801345825195, + "learning_rate": 1e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7852642983198166, + "num_tokens": 362499941.0, + "step": 1258 + }, + { + "epoch": 0.44844167408726626, + "grad_norm": 0.7486396431922913, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7365526258945465, + "num_tokens": 362746371.0, + "step": 1259 + }, + { + "epoch": 0.44879786286731965, + "grad_norm": 0.6769547462463379, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7669253200292587, + "num_tokens": 363057317.0, + "step": 1260 + }, + { + "epoch": 0.4491540516473731, + "grad_norm": 0.6672478914260864, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7724129855632782, + "num_tokens": 363322254.0, + "step": 1261 + }, + { + "epoch": 0.44951024042742654, + "grad_norm": 0.6650082468986511, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7521748840808868, + "num_tokens": 363613474.0, + "step": 1262 + }, + { + "epoch": 0.44986642920748, + "grad_norm": 0.7090915441513062, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 
0.7531610876321793, + "num_tokens": 363891393.0, + "step": 1263 + }, + { + "epoch": 0.4502226179875334, + "grad_norm": 0.7134104371070862, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7542632967233658, + "num_tokens": 364158955.0, + "step": 1264 + }, + { + "epoch": 0.4505788067675868, + "grad_norm": 0.6934625506401062, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7413288354873657, + "num_tokens": 364451915.0, + "step": 1265 + }, + { + "epoch": 0.45093499554764027, + "grad_norm": 0.6992862820625305, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7484667897224426, + "num_tokens": 364729613.0, + "step": 1266 + }, + { + "epoch": 0.45129118432769366, + "grad_norm": 0.6782872676849365, + "learning_rate": 1e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7568159401416779, + "num_tokens": 365034420.0, + "step": 1267 + }, + { + "epoch": 0.4516473731077471, + "grad_norm": 0.7741929888725281, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7410172373056412, + "num_tokens": 365301011.0, + "step": 1268 + }, + { + "epoch": 0.45200356188780055, + "grad_norm": 0.6537835001945496, + "learning_rate": 1e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7553246319293976, + "num_tokens": 365621156.0, + "step": 1269 + }, + { + "epoch": 0.45235975066785394, + "grad_norm": 0.7010008096694946, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7426173090934753, + "num_tokens": 365913291.0, + "step": 1270 + }, + { + "epoch": 0.4527159394479074, + "grad_norm": 0.6893748641014099, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7551710158586502, + "num_tokens": 366185021.0, + "step": 1271 + }, + { + "epoch": 0.45307212822796084, + "grad_norm": 0.6784184575080872, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7488010227680206, + "num_tokens": 366476661.0, + "step": 1272 + }, + { + "epoch": 0.4534283170080142, + "grad_norm": 
0.690559983253479, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7540045827627182, + "num_tokens": 366757628.0, + "step": 1273 + }, + { + "epoch": 0.45378450578806767, + "grad_norm": 0.708936333656311, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7429597526788712, + "num_tokens": 367056365.0, + "step": 1274 + }, + { + "epoch": 0.4541406945681211, + "grad_norm": 0.6555215120315552, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7523055374622345, + "num_tokens": 367368027.0, + "step": 1275 + }, + { + "epoch": 0.4544968833481745, + "grad_norm": 0.6647518277168274, + "learning_rate": 1e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7793429344892502, + "num_tokens": 367659892.0, + "step": 1276 + }, + { + "epoch": 0.45485307212822795, + "grad_norm": 0.6581811904907227, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7433116137981415, + "num_tokens": 367946157.0, + "step": 1277 + }, + { + "epoch": 0.4552092609082814, + "grad_norm": 0.6645776629447937, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7707825303077698, + "num_tokens": 368252892.0, + "step": 1278 + }, + { + "epoch": 0.45556544968833484, + "grad_norm": 0.6891472935676575, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7459191530942917, + "num_tokens": 368543880.0, + "step": 1279 + }, + { + "epoch": 0.45592163846838824, + "grad_norm": 0.6555699110031128, + "learning_rate": 1e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7574552148580551, + "num_tokens": 368879782.0, + "step": 1280 + }, + { + "epoch": 0.4562778272484417, + "grad_norm": 0.647689938545227, + "learning_rate": 1e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7500459402799606, + "num_tokens": 369198853.0, + "step": 1281 + }, + { + "epoch": 0.4566340160284951, + "grad_norm": 0.7121984958648682, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7352192997932434, + "num_tokens": 
369480147.0, + "step": 1282 + }, + { + "epoch": 0.4569902048085485, + "grad_norm": 0.7364363670349121, + "learning_rate": 1e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7673592567443848, + "num_tokens": 369755745.0, + "step": 1283 + }, + { + "epoch": 0.45734639358860196, + "grad_norm": 0.693638801574707, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7376826256513596, + "num_tokens": 370078512.0, + "step": 1284 + }, + { + "epoch": 0.4577025823686554, + "grad_norm": 0.7185516357421875, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7249381393194199, + "num_tokens": 370350114.0, + "step": 1285 + }, + { + "epoch": 0.4580587711487088, + "grad_norm": 0.6767480373382568, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7565873563289642, + "num_tokens": 370631845.0, + "step": 1286 + }, + { + "epoch": 0.45841495992876224, + "grad_norm": 0.7035843729972839, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7558070570230484, + "num_tokens": 370909410.0, + "step": 1287 + }, + { + "epoch": 0.4587711487088157, + "grad_norm": 0.7084554433822632, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.737004354596138, + "num_tokens": 371188191.0, + "step": 1288 + }, + { + "epoch": 0.4591273374888691, + "grad_norm": 0.7197967767715454, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7476942390203476, + "num_tokens": 371488828.0, + "step": 1289 + }, + { + "epoch": 0.4594835262689225, + "grad_norm": 0.6519590616226196, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7703315913677216, + "num_tokens": 371801167.0, + "step": 1290 + }, + { + "epoch": 0.45983971504897597, + "grad_norm": 0.6492793560028076, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7577568292617798, + "num_tokens": 372146016.0, + "step": 1291 + }, + { + "epoch": 0.46019590382902936, + "grad_norm": 0.6878310441970825, + "learning_rate": 
1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7374742031097412, + "num_tokens": 372427735.0, + "step": 1292 + }, + { + "epoch": 0.4605520926090828, + "grad_norm": 0.6926255226135254, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7496969550848007, + "num_tokens": 372718496.0, + "step": 1293 + }, + { + "epoch": 0.46090828138913625, + "grad_norm": 0.7146655321121216, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7597076296806335, + "num_tokens": 372989967.0, + "step": 1294 + }, + { + "epoch": 0.46126447016918964, + "grad_norm": 0.772001326084137, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7325578033924103, + "num_tokens": 373234321.0, + "step": 1295 + }, + { + "epoch": 0.4616206589492431, + "grad_norm": 0.7040675282478333, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7573638558387756, + "num_tokens": 373526250.0, + "step": 1296 + }, + { + "epoch": 0.46197684772929654, + "grad_norm": 0.6864003539085388, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7413443028926849, + "num_tokens": 373808979.0, + "step": 1297 + }, + { + "epoch": 0.46233303650935, + "grad_norm": 0.7187809944152832, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7531770169734955, + "num_tokens": 374095183.0, + "step": 1298 + }, + { + "epoch": 0.46268922528940337, + "grad_norm": 0.7334157228469849, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7635949999094009, + "num_tokens": 374357532.0, + "step": 1299 + }, + { + "epoch": 0.4630454140694568, + "grad_norm": 0.6870716214179993, + "learning_rate": 1e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.77097487449646, + "num_tokens": 374652765.0, + "step": 1300 + }, + { + "epoch": 0.46340160284951026, + "grad_norm": 0.6796295046806335, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7580841779708862, + "num_tokens": 374925703.0, + "step": 1301 + }, + { + 
"epoch": 0.46375779162956365, + "grad_norm": 0.6896849870681763, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7410407960414886, + "num_tokens": 375223836.0, + "step": 1302 + }, + { + "epoch": 0.4641139804096171, + "grad_norm": 0.7673896551132202, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7559596300125122, + "num_tokens": 375502154.0, + "step": 1303 + }, + { + "epoch": 0.46447016918967055, + "grad_norm": 0.7003068923950195, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.780892550945282, + "num_tokens": 375786166.0, + "step": 1304 + }, + { + "epoch": 0.46482635796972394, + "grad_norm": 0.6817623972892761, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7650084942579269, + "num_tokens": 376069325.0, + "step": 1305 + }, + { + "epoch": 0.4651825467497774, + "grad_norm": 0.7526620030403137, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7455774992704391, + "num_tokens": 376324549.0, + "step": 1306 + }, + { + "epoch": 0.4655387355298308, + "grad_norm": 0.7050649523735046, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7645736187696457, + "num_tokens": 376607697.0, + "step": 1307 + }, + { + "epoch": 0.4658949243098842, + "grad_norm": 0.6953793168067932, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7552141398191452, + "num_tokens": 376901094.0, + "step": 1308 + }, + { + "epoch": 0.46625111308993766, + "grad_norm": 0.7482753992080688, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7332423776388168, + "num_tokens": 377177253.0, + "step": 1309 + }, + { + "epoch": 0.4666073018699911, + "grad_norm": 0.6894402503967285, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7383391708135605, + "num_tokens": 377453702.0, + "step": 1310 + }, + { + "epoch": 0.4669634906500445, + "grad_norm": 0.7166628837585449, + "learning_rate": 1e-06, + "loss": 0.8066, + 
"mean_token_accuracy": 0.7504431009292603, + "num_tokens": 377733709.0, + "step": 1311 + }, + { + "epoch": 0.46731967943009795, + "grad_norm": 0.726115882396698, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7567795813083649, + "num_tokens": 378008002.0, + "step": 1312 + }, + { + "epoch": 0.4676758682101514, + "grad_norm": 0.6603044867515564, + "learning_rate": 1e-06, + "loss": 0.6766, + "mean_token_accuracy": 0.781524047255516, + "num_tokens": 378308757.0, + "step": 1313 + }, + { + "epoch": 0.4680320569902048, + "grad_norm": 0.6462889313697815, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7667070627212524, + "num_tokens": 378627664.0, + "step": 1314 + }, + { + "epoch": 0.4683882457702582, + "grad_norm": 0.631384015083313, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7442147731781006, + "num_tokens": 378936949.0, + "step": 1315 + }, + { + "epoch": 0.4687444345503117, + "grad_norm": 0.6824116706848145, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7428151667118073, + "num_tokens": 379226842.0, + "step": 1316 + }, + { + "epoch": 0.4691006233303651, + "grad_norm": 0.6776832938194275, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7547532320022583, + "num_tokens": 379507502.0, + "step": 1317 + }, + { + "epoch": 0.4694568121104185, + "grad_norm": 0.6637802124023438, + "learning_rate": 1e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7627327591180801, + "num_tokens": 379798873.0, + "step": 1318 + }, + { + "epoch": 0.46981300089047195, + "grad_norm": 0.729166567325592, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7524060755968094, + "num_tokens": 380048992.0, + "step": 1319 + }, + { + "epoch": 0.4701691896705254, + "grad_norm": 0.6714612245559692, + "learning_rate": 1e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.76903036236763, + "num_tokens": 380350849.0, + "step": 1320 + }, + { + "epoch": 0.4705253784505788, + 
"grad_norm": 0.6939135193824768, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7586372047662735, + "num_tokens": 380664399.0, + "step": 1321 + }, + { + "epoch": 0.47088156723063224, + "grad_norm": 0.7205054759979248, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.758786991238594, + "num_tokens": 380980432.0, + "step": 1322 + }, + { + "epoch": 0.4712377560106857, + "grad_norm": 0.6585839986801147, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7645269334316254, + "num_tokens": 381295681.0, + "step": 1323 + }, + { + "epoch": 0.4715939447907391, + "grad_norm": 0.6832968592643738, + "learning_rate": 1e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.7504745125770569, + "num_tokens": 381586092.0, + "step": 1324 + }, + { + "epoch": 0.4719501335707925, + "grad_norm": 0.6858633160591125, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7586767375469208, + "num_tokens": 381909876.0, + "step": 1325 + }, + { + "epoch": 0.47230632235084596, + "grad_norm": 0.7141995429992676, + "learning_rate": 1e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7469089329242706, + "num_tokens": 382220802.0, + "step": 1326 + }, + { + "epoch": 0.47266251113089935, + "grad_norm": 0.6833315491676331, + "learning_rate": 1e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.7544162273406982, + "num_tokens": 382509611.0, + "step": 1327 + }, + { + "epoch": 0.4730186999109528, + "grad_norm": 0.7300421595573425, + "learning_rate": 1e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7471490204334259, + "num_tokens": 382788118.0, + "step": 1328 + }, + { + "epoch": 0.47337488869100625, + "grad_norm": 0.7057307362556458, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7422066032886505, + "num_tokens": 383081565.0, + "step": 1329 + }, + { + "epoch": 0.47373107747105964, + "grad_norm": 0.7300113439559937, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7436651140451431, + 
"num_tokens": 383349345.0, + "step": 1330 + }, + { + "epoch": 0.4740872662511131, + "grad_norm": 0.6833487749099731, + "learning_rate": 1e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7568764537572861, + "num_tokens": 383638241.0, + "step": 1331 + }, + { + "epoch": 0.4744434550311665, + "grad_norm": 0.7064011096954346, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7537237107753754, + "num_tokens": 383926439.0, + "step": 1332 + }, + { + "epoch": 0.47479964381122, + "grad_norm": 0.7162995934486389, + "learning_rate": 1e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7492009252309799, + "num_tokens": 384218985.0, + "step": 1333 + }, + { + "epoch": 0.47515583259127336, + "grad_norm": 0.6965609788894653, + "learning_rate": 1e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7633824646472931, + "num_tokens": 384528640.0, + "step": 1334 + }, + { + "epoch": 0.4755120213713268, + "grad_norm": 0.704565167427063, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7667969465255737, + "num_tokens": 384810854.0, + "step": 1335 + }, + { + "epoch": 0.47586821015138026, + "grad_norm": 0.6448087692260742, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7548262476921082, + "num_tokens": 385130579.0, + "step": 1336 + }, + { + "epoch": 0.47622439893143365, + "grad_norm": 0.6938846111297607, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7475506067276001, + "num_tokens": 385434809.0, + "step": 1337 + }, + { + "epoch": 0.4765805877114871, + "grad_norm": 0.7132769227027893, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7461777627468109, + "num_tokens": 385698189.0, + "step": 1338 + }, + { + "epoch": 0.47693677649154054, + "grad_norm": 0.7102010250091553, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7428551912307739, + "num_tokens": 385969583.0, + "step": 1339 + }, + { + "epoch": 0.4772929652715939, + "grad_norm": 0.7076168656349182, + 
"learning_rate": 1e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7563647776842117, + "num_tokens": 386266072.0, + "step": 1340 + }, + { + "epoch": 0.4776491540516474, + "grad_norm": 0.7421818971633911, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7704915404319763, + "num_tokens": 386522343.0, + "step": 1341 + }, + { + "epoch": 0.4780053428317008, + "grad_norm": 0.6582825183868408, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7661447674036026, + "num_tokens": 386812952.0, + "step": 1342 + }, + { + "epoch": 0.4783615316117542, + "grad_norm": 0.6761593818664551, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7524781972169876, + "num_tokens": 387111572.0, + "step": 1343 + }, + { + "epoch": 0.47871772039180766, + "grad_norm": 0.6711142063140869, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7599764913320541, + "num_tokens": 387401366.0, + "step": 1344 + }, + { + "epoch": 0.4790739091718611, + "grad_norm": 0.6353307366371155, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.7565484344959259, + "num_tokens": 387731058.0, + "step": 1345 + }, + { + "epoch": 0.4794300979519145, + "grad_norm": 0.6770610213279724, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7414587885141373, + "num_tokens": 388028676.0, + "step": 1346 + }, + { + "epoch": 0.47978628673196794, + "grad_norm": 0.6443505883216858, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7450805604457855, + "num_tokens": 388348648.0, + "step": 1347 + }, + { + "epoch": 0.4801424755120214, + "grad_norm": 0.6616813540458679, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7624663710594177, + "num_tokens": 388668421.0, + "step": 1348 + }, + { + "epoch": 0.4804986642920748, + "grad_norm": 0.772908627986908, + "learning_rate": 1e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7541593760251999, + "num_tokens": 388938038.0, + "step": 
1349 + }, + { + "epoch": 0.4808548530721282, + "grad_norm": 0.6933234930038452, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7540897727012634, + "num_tokens": 389228577.0, + "step": 1350 + }, + { + "epoch": 0.48121104185218166, + "grad_norm": 0.7286231517791748, + "learning_rate": 1e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.7577617764472961, + "num_tokens": 389461281.0, + "step": 1351 + }, + { + "epoch": 0.4815672306322351, + "grad_norm": 0.7242470383644104, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7454307526350021, + "num_tokens": 389738265.0, + "step": 1352 + }, + { + "epoch": 0.4819234194122885, + "grad_norm": 0.7167354226112366, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7517776042222977, + "num_tokens": 389999797.0, + "step": 1353 + }, + { + "epoch": 0.48227960819234195, + "grad_norm": 0.6906574964523315, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7528944164514542, + "num_tokens": 390266451.0, + "step": 1354 + }, + { + "epoch": 0.4826357969723954, + "grad_norm": 0.6442222595214844, + "learning_rate": 1e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7544716894626617, + "num_tokens": 390577469.0, + "step": 1355 + }, + { + "epoch": 0.4829919857524488, + "grad_norm": 0.6484500765800476, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7590746134519577, + "num_tokens": 390883477.0, + "step": 1356 + }, + { + "epoch": 0.48334817453250223, + "grad_norm": 0.6823197603225708, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7560841590166092, + "num_tokens": 391185245.0, + "step": 1357 + }, + { + "epoch": 0.4837043633125557, + "grad_norm": 0.7083442807197571, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7469482421875, + "num_tokens": 391474860.0, + "step": 1358 + }, + { + "epoch": 0.48406055209260906, + "grad_norm": 0.6886212825775146, + "learning_rate": 1e-06, + "loss": 0.8456, + 
"mean_token_accuracy": 0.7435162216424942, + "num_tokens": 391760889.0, + "step": 1359 + }, + { + "epoch": 0.4844167408726625, + "grad_norm": 0.6715748906135559, + "learning_rate": 1e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7572074383497238, + "num_tokens": 392050697.0, + "step": 1360 + }, + { + "epoch": 0.48477292965271596, + "grad_norm": 0.7361807227134705, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7366000711917877, + "num_tokens": 392318900.0, + "step": 1361 + }, + { + "epoch": 0.48512911843276935, + "grad_norm": 0.6926305294036865, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7678073048591614, + "num_tokens": 392595135.0, + "step": 1362 + }, + { + "epoch": 0.4854853072128228, + "grad_norm": 0.6363987326622009, + "learning_rate": 1e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.774316668510437, + "num_tokens": 392902921.0, + "step": 1363 + }, + { + "epoch": 0.48584149599287624, + "grad_norm": 0.7317789196968079, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7413395643234253, + "num_tokens": 393195360.0, + "step": 1364 + }, + { + "epoch": 0.48619768477292963, + "grad_norm": 0.6366987824440002, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7676767259836197, + "num_tokens": 393494395.0, + "step": 1365 + }, + { + "epoch": 0.4865538735529831, + "grad_norm": 0.6908860802650452, + "learning_rate": 1e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7626542448997498, + "num_tokens": 393755777.0, + "step": 1366 + }, + { + "epoch": 0.4869100623330365, + "grad_norm": 0.6699550747871399, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7612812370061874, + "num_tokens": 394046567.0, + "step": 1367 + }, + { + "epoch": 0.48726625111308997, + "grad_norm": 0.6806095242500305, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7511949241161346, + "num_tokens": 394327536.0, + "step": 1368 + }, + { + "epoch": 
0.48762243989314336, + "grad_norm": 0.6751295924186707, + "learning_rate": 1e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7558742612600327, + "num_tokens": 394608826.0, + "step": 1369 + }, + { + "epoch": 0.4879786286731968, + "grad_norm": 0.6744613647460938, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7654305547475815, + "num_tokens": 394891354.0, + "step": 1370 + }, + { + "epoch": 0.48833481745325025, + "grad_norm": 0.7159210443496704, + "learning_rate": 1e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7559211552143097, + "num_tokens": 395149595.0, + "step": 1371 + }, + { + "epoch": 0.48869100623330364, + "grad_norm": 0.7601350545883179, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7474627047777176, + "num_tokens": 395389449.0, + "step": 1372 + }, + { + "epoch": 0.4890471950133571, + "grad_norm": 0.7398225665092468, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7564377933740616, + "num_tokens": 395652032.0, + "step": 1373 + }, + { + "epoch": 0.48940338379341053, + "grad_norm": 0.7370915412902832, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7601025700569153, + "num_tokens": 395930969.0, + "step": 1374 + }, + { + "epoch": 0.4897595725734639, + "grad_norm": 0.66384357213974, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7638956755399704, + "num_tokens": 396220314.0, + "step": 1375 + }, + { + "epoch": 0.49011576135351737, + "grad_norm": 0.72015780210495, + "learning_rate": 1e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7512420266866684, + "num_tokens": 396478225.0, + "step": 1376 + }, + { + "epoch": 0.4904719501335708, + "grad_norm": 0.6338617205619812, + "learning_rate": 1e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7488253116607666, + "num_tokens": 396803183.0, + "step": 1377 + }, + { + "epoch": 0.4908281389136242, + "grad_norm": 0.7006382346153259, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 
0.7428996860980988, + "num_tokens": 397090079.0, + "step": 1378 + }, + { + "epoch": 0.49118432769367765, + "grad_norm": 0.71150803565979, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7623085081577301, + "num_tokens": 397369466.0, + "step": 1379 + }, + { + "epoch": 0.4915405164737311, + "grad_norm": 0.7335726022720337, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7723268270492554, + "num_tokens": 397643288.0, + "step": 1380 + }, + { + "epoch": 0.4918967052537845, + "grad_norm": 0.7369585633277893, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.758247971534729, + "num_tokens": 397906790.0, + "step": 1381 + }, + { + "epoch": 0.49225289403383793, + "grad_norm": 0.7087600231170654, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7514339685440063, + "num_tokens": 398188535.0, + "step": 1382 + }, + { + "epoch": 0.4926090828138914, + "grad_norm": 0.8316262364387512, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7470463067293167, + "num_tokens": 398427918.0, + "step": 1383 + }, + { + "epoch": 0.49296527159394476, + "grad_norm": 0.6975904703140259, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7558309733867645, + "num_tokens": 398726638.0, + "step": 1384 + }, + { + "epoch": 0.4933214603739982, + "grad_norm": 0.6775891780853271, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7595167905092239, + "num_tokens": 399019097.0, + "step": 1385 + }, + { + "epoch": 0.49367764915405166, + "grad_norm": 0.7513808608055115, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7589735984802246, + "num_tokens": 399315668.0, + "step": 1386 + }, + { + "epoch": 0.4940338379341051, + "grad_norm": 0.7420421242713928, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7623849958181381, + "num_tokens": 399567821.0, + "step": 1387 + }, + { + "epoch": 0.4943900267141585, + "grad_norm": 
0.7111037969589233, + "learning_rate": 1e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7634346038103104, + "num_tokens": 399853472.0, + "step": 1388 + }, + { + "epoch": 0.49474621549421194, + "grad_norm": 0.6540206670761108, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7585694938898087, + "num_tokens": 400147611.0, + "step": 1389 + }, + { + "epoch": 0.4951024042742654, + "grad_norm": 0.6545911431312561, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7658095806837082, + "num_tokens": 400457846.0, + "step": 1390 + }, + { + "epoch": 0.4954585930543188, + "grad_norm": 0.6556164026260376, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7622440308332443, + "num_tokens": 400753608.0, + "step": 1391 + }, + { + "epoch": 0.4958147818343722, + "grad_norm": 0.6356117725372314, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.748801976442337, + "num_tokens": 401065028.0, + "step": 1392 + }, + { + "epoch": 0.49617097061442567, + "grad_norm": 0.7099463939666748, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7548068463802338, + "num_tokens": 401330729.0, + "step": 1393 + }, + { + "epoch": 0.49652715939447906, + "grad_norm": 0.7039507627487183, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7540329694747925, + "num_tokens": 401607909.0, + "step": 1394 + }, + { + "epoch": 0.4968833481745325, + "grad_norm": 0.6932931542396545, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7425272017717361, + "num_tokens": 401885929.0, + "step": 1395 + }, + { + "epoch": 0.49723953695458595, + "grad_norm": 0.6790148615837097, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.768785685300827, + "num_tokens": 402169630.0, + "step": 1396 + }, + { + "epoch": 0.49759572573463934, + "grad_norm": 0.6664863228797913, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7619289606809616, + "num_tokens": 
402460978.0, + "step": 1397 + }, + { + "epoch": 0.4979519145146928, + "grad_norm": 0.7006672024726868, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7564103454351425, + "num_tokens": 402746899.0, + "step": 1398 + }, + { + "epoch": 0.49830810329474623, + "grad_norm": 0.7589887976646423, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7455973625183105, + "num_tokens": 402999897.0, + "step": 1399 + }, + { + "epoch": 0.4986642920747996, + "grad_norm": 0.6770901679992676, + "learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7746555358171463, + "num_tokens": 403292234.0, + "step": 1400 + }, + { + "epoch": 0.49902048085485307, + "grad_norm": 0.6847586631774902, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7388576120138168, + "num_tokens": 403577584.0, + "step": 1401 + }, + { + "epoch": 0.4993766696349065, + "grad_norm": 0.6884822249412537, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7707542479038239, + "num_tokens": 403864562.0, + "step": 1402 + }, + { + "epoch": 0.4997328584149599, + "grad_norm": 0.7346857786178589, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7464113235473633, + "num_tokens": 404114984.0, + "step": 1403 + }, + { + "epoch": 0.5000890471950133, + "grad_norm": 0.7096836566925049, + "learning_rate": 1e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7701044976711273, + "num_tokens": 404415666.0, + "step": 1404 + }, + { + "epoch": 0.5004452359750667, + "grad_norm": 0.6648792624473572, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7642274051904678, + "num_tokens": 404725087.0, + "step": 1405 + }, + { + "epoch": 0.5008014247551202, + "grad_norm": 0.7093903422355652, + "learning_rate": 1e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7697034627199173, + "num_tokens": 405000549.0, + "step": 1406 + }, + { + "epoch": 0.5011576135351736, + "grad_norm": 0.7074877619743347, + "learning_rate": 
1e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7523920089006424, + "num_tokens": 405295147.0, + "step": 1407 + }, + { + "epoch": 0.501513802315227, + "grad_norm": 0.7613853812217712, + "learning_rate": 1e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7643864452838898, + "num_tokens": 405556822.0, + "step": 1408 + }, + { + "epoch": 0.5018699910952805, + "grad_norm": 0.7249979972839355, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.753338411450386, + "num_tokens": 405822579.0, + "step": 1409 + }, + { + "epoch": 0.5022261798753339, + "grad_norm": 0.6702980399131775, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7681991308927536, + "num_tokens": 406115871.0, + "step": 1410 + }, + { + "epoch": 0.5025823686553873, + "grad_norm": 0.7172116041183472, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7619708925485611, + "num_tokens": 406405598.0, + "step": 1411 + }, + { + "epoch": 0.5029385574354408, + "grad_norm": 0.7171390652656555, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7626523971557617, + "num_tokens": 406660274.0, + "step": 1412 + }, + { + "epoch": 0.5032947462154942, + "grad_norm": 0.639884889125824, + "learning_rate": 1e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.760586142539978, + "num_tokens": 406985757.0, + "step": 1413 + }, + { + "epoch": 0.5036509349955477, + "grad_norm": 0.727424681186676, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7584909945726395, + "num_tokens": 407285874.0, + "step": 1414 + }, + { + "epoch": 0.5040071237756011, + "grad_norm": 0.7068268060684204, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7551449537277222, + "num_tokens": 407554585.0, + "step": 1415 + }, + { + "epoch": 0.5043633125556545, + "grad_norm": 0.662970781326294, + "learning_rate": 1e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7708410173654556, + "num_tokens": 407860196.0, + "step": 1416 + }, + { + "epoch": 
0.504719501335708, + "grad_norm": 0.6769307851791382, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7542339265346527, + "num_tokens": 408149280.0, + "step": 1417 + }, + { + "epoch": 0.5050756901157614, + "grad_norm": 0.7274636030197144, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7672930955886841, + "num_tokens": 408426120.0, + "step": 1418 + }, + { + "epoch": 0.5054318788958148, + "grad_norm": 0.7458218932151794, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7419565767049789, + "num_tokens": 408701039.0, + "step": 1419 + }, + { + "epoch": 0.5057880676758683, + "grad_norm": 0.6971063017845154, + "learning_rate": 1e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7590503096580505, + "num_tokens": 408979628.0, + "step": 1420 + }, + { + "epoch": 0.5061442564559216, + "grad_norm": 0.64324551820755, + "learning_rate": 1e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7629604786634445, + "num_tokens": 409304581.0, + "step": 1421 + }, + { + "epoch": 0.506500445235975, + "grad_norm": 0.6828421354293823, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7494462430477142, + "num_tokens": 409598662.0, + "step": 1422 + }, + { + "epoch": 0.5068566340160285, + "grad_norm": 0.731465220451355, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7628789395093918, + "num_tokens": 409880538.0, + "step": 1423 + }, + { + "epoch": 0.5072128227960819, + "grad_norm": 0.6458585858345032, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7678451687097549, + "num_tokens": 410208788.0, + "step": 1424 + }, + { + "epoch": 0.5075690115761353, + "grad_norm": 0.6962553262710571, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7566481530666351, + "num_tokens": 410487754.0, + "step": 1425 + }, + { + "epoch": 0.5079252003561888, + "grad_norm": 0.6964324712753296, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 
0.7389775067567825, + "num_tokens": 410773434.0, + "step": 1426 + }, + { + "epoch": 0.5082813891362422, + "grad_norm": 0.722234845161438, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7414982467889786, + "num_tokens": 411030527.0, + "step": 1427 + }, + { + "epoch": 0.5086375779162956, + "grad_norm": 0.661239743232727, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7621628642082214, + "num_tokens": 411326299.0, + "step": 1428 + }, + { + "epoch": 0.5089937666963491, + "grad_norm": 0.7278502583503723, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7420754283666611, + "num_tokens": 411598268.0, + "step": 1429 + }, + { + "epoch": 0.5093499554764025, + "grad_norm": 0.679451584815979, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7538231164216995, + "num_tokens": 411905379.0, + "step": 1430 + }, + { + "epoch": 0.5097061442564559, + "grad_norm": 0.6851945519447327, + "learning_rate": 1e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7505689114332199, + "num_tokens": 412185608.0, + "step": 1431 + }, + { + "epoch": 0.5100623330365094, + "grad_norm": 0.7471150755882263, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7417658120393753, + "num_tokens": 412457567.0, + "step": 1432 + }, + { + "epoch": 0.5104185218165628, + "grad_norm": 0.6859697699546814, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7677827626466751, + "num_tokens": 412746811.0, + "step": 1433 + }, + { + "epoch": 0.5107747105966162, + "grad_norm": 0.7280901670455933, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.770003154873848, + "num_tokens": 413013076.0, + "step": 1434 + }, + { + "epoch": 0.5111308993766697, + "grad_norm": 0.7191663980484009, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7405661195516586, + "num_tokens": 413281939.0, + "step": 1435 + }, + { + "epoch": 0.5114870881567231, + "grad_norm": 
0.6529797911643982, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7678494453430176, + "num_tokens": 413582115.0, + "step": 1436 + }, + { + "epoch": 0.5118432769367764, + "grad_norm": 0.7090849876403809, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7354100048542023, + "num_tokens": 413854283.0, + "step": 1437 + }, + { + "epoch": 0.51219946571683, + "grad_norm": 0.6938977837562561, + "learning_rate": 1e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7479419857263565, + "num_tokens": 414157053.0, + "step": 1438 + }, + { + "epoch": 0.5125556544968833, + "grad_norm": 0.714258074760437, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.752218097448349, + "num_tokens": 414419953.0, + "step": 1439 + }, + { + "epoch": 0.5129118432769367, + "grad_norm": 0.701742947101593, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7618148475885391, + "num_tokens": 414712675.0, + "step": 1440 + }, + { + "epoch": 0.5132680320569902, + "grad_norm": 0.683506429195404, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7441318929195404, + "num_tokens": 415006074.0, + "step": 1441 + }, + { + "epoch": 0.5136242208370436, + "grad_norm": 0.6637641787528992, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7618581801652908, + "num_tokens": 415316354.0, + "step": 1442 + }, + { + "epoch": 0.513980409617097, + "grad_norm": 0.6962178349494934, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7627944648265839, + "num_tokens": 415587936.0, + "step": 1443 + }, + { + "epoch": 0.5143365983971505, + "grad_norm": 0.7001758813858032, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7540484368801117, + "num_tokens": 415870513.0, + "step": 1444 + }, + { + "epoch": 0.5146927871772039, + "grad_norm": 0.7461907863616943, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.756824865937233, + "num_tokens": 
416127329.0, + "step": 1445 + }, + { + "epoch": 0.5150489759572573, + "grad_norm": 0.6300566792488098, + "learning_rate": 1e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.7720569372177124, + "num_tokens": 416433137.0, + "step": 1446 + }, + { + "epoch": 0.5154051647373108, + "grad_norm": 0.7285535335540771, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7622507661581039, + "num_tokens": 416690920.0, + "step": 1447 + }, + { + "epoch": 0.5157613535173642, + "grad_norm": 0.6427170634269714, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7528639286756516, + "num_tokens": 417008846.0, + "step": 1448 + }, + { + "epoch": 0.5161175422974177, + "grad_norm": 0.7037660479545593, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7520909607410431, + "num_tokens": 417271793.0, + "step": 1449 + }, + { + "epoch": 0.5164737310774711, + "grad_norm": 0.7082017064094543, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7463861256837845, + "num_tokens": 417537384.0, + "step": 1450 + }, + { + "epoch": 0.5168299198575245, + "grad_norm": 0.6384367346763611, + "learning_rate": 1e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7564508765935898, + "num_tokens": 417844575.0, + "step": 1451 + }, + { + "epoch": 0.517186108637578, + "grad_norm": 0.6483922004699707, + "learning_rate": 1e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7710580080747604, + "num_tokens": 418156389.0, + "step": 1452 + }, + { + "epoch": 0.5175422974176314, + "grad_norm": 0.672046422958374, + "learning_rate": 1e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.787045493721962, + "num_tokens": 418429972.0, + "step": 1453 + }, + { + "epoch": 0.5178984861976847, + "grad_norm": 0.661260187625885, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7532605081796646, + "num_tokens": 418711969.0, + "step": 1454 + }, + { + "epoch": 0.5182546749777382, + "grad_norm": 0.6553703546524048, + "learning_rate": 1e-06, + 
"loss": 0.8402, + "mean_token_accuracy": 0.7420694828033447, + "num_tokens": 419015631.0, + "step": 1455 + }, + { + "epoch": 0.5186108637577916, + "grad_norm": 0.7461816072463989, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7449348717927933, + "num_tokens": 419274462.0, + "step": 1456 + }, + { + "epoch": 0.518967052537845, + "grad_norm": 0.6903891563415527, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7618819028139114, + "num_tokens": 419571595.0, + "step": 1457 + }, + { + "epoch": 0.5193232413178985, + "grad_norm": 0.7330911159515381, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7702622562646866, + "num_tokens": 419856647.0, + "step": 1458 + }, + { + "epoch": 0.5196794300979519, + "grad_norm": 0.7026988863945007, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7578598856925964, + "num_tokens": 420117154.0, + "step": 1459 + }, + { + "epoch": 0.5200356188780053, + "grad_norm": 0.6664429903030396, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7602083384990692, + "num_tokens": 420398331.0, + "step": 1460 + }, + { + "epoch": 0.5203918076580588, + "grad_norm": 0.7196788787841797, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7543942630290985, + "num_tokens": 420666952.0, + "step": 1461 + }, + { + "epoch": 0.5207479964381122, + "grad_norm": 0.6982242465019226, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.748967781662941, + "num_tokens": 420925641.0, + "step": 1462 + }, + { + "epoch": 0.5211041852181656, + "grad_norm": 0.66214519739151, + "learning_rate": 1e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.777863547205925, + "num_tokens": 421215057.0, + "step": 1463 + }, + { + "epoch": 0.5214603739982191, + "grad_norm": 0.6664472222328186, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7575951218605042, + "num_tokens": 421503331.0, + "step": 1464 + }, + { + "epoch": 
0.5218165627782725, + "grad_norm": 0.6775723695755005, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7620982080698013, + "num_tokens": 421797703.0, + "step": 1465 + }, + { + "epoch": 0.5221727515583259, + "grad_norm": 0.6630513668060303, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7425749152898788, + "num_tokens": 422110651.0, + "step": 1466 + }, + { + "epoch": 0.5225289403383794, + "grad_norm": 0.687972903251648, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7408843785524368, + "num_tokens": 422391714.0, + "step": 1467 + }, + { + "epoch": 0.5228851291184328, + "grad_norm": 0.7126241326332092, + "learning_rate": 1e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7830757945775986, + "num_tokens": 422655910.0, + "step": 1468 + }, + { + "epoch": 0.5232413178984862, + "grad_norm": 0.6915825605392456, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7607764452695847, + "num_tokens": 422934652.0, + "step": 1469 + }, + { + "epoch": 0.5235975066785397, + "grad_norm": 0.6759036779403687, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7637775987386703, + "num_tokens": 423239591.0, + "step": 1470 + }, + { + "epoch": 0.523953695458593, + "grad_norm": 0.6476072072982788, + "learning_rate": 1e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7574239373207092, + "num_tokens": 423564435.0, + "step": 1471 + }, + { + "epoch": 0.5243098842386464, + "grad_norm": 0.6850994229316711, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7648671269416809, + "num_tokens": 423833030.0, + "step": 1472 + }, + { + "epoch": 0.5246660730186999, + "grad_norm": 0.7138441205024719, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.764201283454895, + "num_tokens": 424135606.0, + "step": 1473 + }, + { + "epoch": 0.5250222617987533, + "grad_norm": 0.7076702117919922, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 
0.7570366710424423, + "num_tokens": 424403333.0, + "step": 1474 + }, + { + "epoch": 0.5253784505788067, + "grad_norm": 0.6953333020210266, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7544546872377396, + "num_tokens": 424688309.0, + "step": 1475 + }, + { + "epoch": 0.5257346393588602, + "grad_norm": 0.6812365055084229, + "learning_rate": 1e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.7826896607875824, + "num_tokens": 424986169.0, + "step": 1476 + }, + { + "epoch": 0.5260908281389136, + "grad_norm": 0.7084270119667053, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7482729107141495, + "num_tokens": 425258692.0, + "step": 1477 + }, + { + "epoch": 0.526447016918967, + "grad_norm": 0.6940047144889832, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7599569857120514, + "num_tokens": 425541298.0, + "step": 1478 + }, + { + "epoch": 0.5268032056990205, + "grad_norm": 0.7427356839179993, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7659169286489487, + "num_tokens": 425797775.0, + "step": 1479 + }, + { + "epoch": 0.5271593944790739, + "grad_norm": 0.651579737663269, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7604247182607651, + "num_tokens": 426125274.0, + "step": 1480 + }, + { + "epoch": 0.5275155832591273, + "grad_norm": 0.6583978533744812, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7353017181158066, + "num_tokens": 426425567.0, + "step": 1481 + }, + { + "epoch": 0.5278717720391808, + "grad_norm": 0.726600706577301, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7526037395000458, + "num_tokens": 426691440.0, + "step": 1482 + }, + { + "epoch": 0.5282279608192342, + "grad_norm": 0.7372222542762756, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7535540908575058, + "num_tokens": 426967289.0, + "step": 1483 + }, + { + "epoch": 0.5285841495992876, + "grad_norm": 
0.721373975276947, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7406535595655441, + "num_tokens": 427231557.0, + "step": 1484 + }, + { + "epoch": 0.5289403383793411, + "grad_norm": 0.6732959747314453, + "learning_rate": 1e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7713770568370819, + "num_tokens": 427524172.0, + "step": 1485 + }, + { + "epoch": 0.5292965271593945, + "grad_norm": 0.6733496785163879, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7549264281988144, + "num_tokens": 427800947.0, + "step": 1486 + }, + { + "epoch": 0.529652715939448, + "grad_norm": 0.710117518901825, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7535181194543839, + "num_tokens": 428056601.0, + "step": 1487 + }, + { + "epoch": 0.5300089047195014, + "grad_norm": 0.7111957669258118, + "learning_rate": 1e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7534040212631226, + "num_tokens": 428331741.0, + "step": 1488 + }, + { + "epoch": 0.5303650934995547, + "grad_norm": 0.7214725613594055, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7477433532476425, + "num_tokens": 428607542.0, + "step": 1489 + }, + { + "epoch": 0.5307212822796082, + "grad_norm": 0.6481271386146545, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7457434386014938, + "num_tokens": 428948191.0, + "step": 1490 + }, + { + "epoch": 0.5310774710596616, + "grad_norm": 0.6433812975883484, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7634780257940292, + "num_tokens": 429262222.0, + "step": 1491 + }, + { + "epoch": 0.531433659839715, + "grad_norm": 0.6297028064727783, + "learning_rate": 1e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7757818400859833, + "num_tokens": 429590366.0, + "step": 1492 + }, + { + "epoch": 0.5317898486197685, + "grad_norm": 0.6898259520530701, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7475043535232544, + "num_tokens": 
429852817.0, + "step": 1493 + }, + { + "epoch": 0.5321460373998219, + "grad_norm": 0.6847946643829346, + "learning_rate": 1e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7724843323230743, + "num_tokens": 430137307.0, + "step": 1494 + }, + { + "epoch": 0.5325022261798753, + "grad_norm": 0.6803114414215088, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7653586566448212, + "num_tokens": 430418678.0, + "step": 1495 + }, + { + "epoch": 0.5328584149599288, + "grad_norm": 0.6536359786987305, + "learning_rate": 1e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7717448472976685, + "num_tokens": 430734923.0, + "step": 1496 + }, + { + "epoch": 0.5332146037399822, + "grad_norm": 0.6719661951065063, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7466239929199219, + "num_tokens": 431053698.0, + "step": 1497 + }, + { + "epoch": 0.5335707925200356, + "grad_norm": 0.7497108578681946, + "learning_rate": 1e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.7390056252479553, + "num_tokens": 431303708.0, + "step": 1498 + }, + { + "epoch": 0.5339269813000891, + "grad_norm": 0.6344057321548462, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7694574743509293, + "num_tokens": 431636663.0, + "step": 1499 + }, + { + "epoch": 0.5342831700801425, + "grad_norm": 0.6702229380607605, + "learning_rate": 1e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.7787425220012665, + "num_tokens": 431922536.0, + "step": 1500 + }, + { + "epoch": 0.5346393588601959, + "grad_norm": 0.6614193320274353, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7532702684402466, + "num_tokens": 432220092.0, + "step": 1501 + }, + { + "epoch": 0.5349955476402494, + "grad_norm": 0.6736346483230591, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7497788369655609, + "num_tokens": 432526999.0, + "step": 1502 + }, + { + "epoch": 0.5353517364203028, + "grad_norm": 0.6726891994476318, + "learning_rate": 
1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.748257040977478, + "num_tokens": 432821779.0, + "step": 1503 + }, + { + "epoch": 0.5357079252003561, + "grad_norm": 0.6495142579078674, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7518501132726669, + "num_tokens": 433118946.0, + "step": 1504 + }, + { + "epoch": 0.5360641139804097, + "grad_norm": 0.6909339427947998, + "learning_rate": 1e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7527299374341965, + "num_tokens": 433410023.0, + "step": 1505 + }, + { + "epoch": 0.536420302760463, + "grad_norm": 0.6498630046844482, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7676701694726944, + "num_tokens": 433724369.0, + "step": 1506 + }, + { + "epoch": 0.5367764915405164, + "grad_norm": 0.7106354236602783, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7673238962888718, + "num_tokens": 433996462.0, + "step": 1507 + }, + { + "epoch": 0.5371326803205699, + "grad_norm": 0.6970576047897339, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.757558599114418, + "num_tokens": 434266231.0, + "step": 1508 + }, + { + "epoch": 0.5374888691006233, + "grad_norm": 0.7159156203269958, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7598247975111008, + "num_tokens": 434558135.0, + "step": 1509 + }, + { + "epoch": 0.5378450578806767, + "grad_norm": 0.6749306917190552, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.759274810552597, + "num_tokens": 434849630.0, + "step": 1510 + }, + { + "epoch": 0.5382012466607302, + "grad_norm": 0.6735959053039551, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7589712888002396, + "num_tokens": 435141572.0, + "step": 1511 + }, + { + "epoch": 0.5385574354407836, + "grad_norm": 0.717812180519104, + "learning_rate": 1e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7479575127363205, + "num_tokens": 435389148.0, + "step": 1512 + }, + { + 
"epoch": 0.538913624220837, + "grad_norm": 0.6940534114837646, + "learning_rate": 1e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7566154152154922, + "num_tokens": 435677702.0, + "step": 1513 + }, + { + "epoch": 0.5392698130008905, + "grad_norm": 0.7310096025466919, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7466770708560944, + "num_tokens": 435936803.0, + "step": 1514 + }, + { + "epoch": 0.5396260017809439, + "grad_norm": 0.6889733076095581, + "learning_rate": 1e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7520657777786255, + "num_tokens": 436215436.0, + "step": 1515 + }, + { + "epoch": 0.5399821905609973, + "grad_norm": 0.6779581308364868, + "learning_rate": 1e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7564261853694916, + "num_tokens": 436512367.0, + "step": 1516 + }, + { + "epoch": 0.5403383793410508, + "grad_norm": 0.6696997880935669, + "learning_rate": 1e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.772243082523346, + "num_tokens": 436807417.0, + "step": 1517 + }, + { + "epoch": 0.5406945681211042, + "grad_norm": 0.6559615731239319, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7576961070299149, + "num_tokens": 437117193.0, + "step": 1518 + }, + { + "epoch": 0.5410507569011576, + "grad_norm": 0.6745485067367554, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7686493992805481, + "num_tokens": 437396312.0, + "step": 1519 + }, + { + "epoch": 0.5414069456812111, + "grad_norm": 0.6929957866668701, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7419531643390656, + "num_tokens": 437671277.0, + "step": 1520 + }, + { + "epoch": 0.5417631344612645, + "grad_norm": 0.7344437837600708, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7383136451244354, + "num_tokens": 437944430.0, + "step": 1521 + }, + { + "epoch": 0.542119323241318, + "grad_norm": 0.653208315372467, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 
0.7335592955350876, + "num_tokens": 438260144.0, + "step": 1522 + }, + { + "epoch": 0.5424755120213713, + "grad_norm": 0.6788403391838074, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7499231994152069, + "num_tokens": 438559396.0, + "step": 1523 + }, + { + "epoch": 0.5428317008014247, + "grad_norm": 0.6114411950111389, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7591837793588638, + "num_tokens": 438930374.0, + "step": 1524 + }, + { + "epoch": 0.5431878895814782, + "grad_norm": 0.6908756494522095, + "learning_rate": 1e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7726874649524689, + "num_tokens": 439211225.0, + "step": 1525 + }, + { + "epoch": 0.5435440783615316, + "grad_norm": 0.6947041749954224, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7393214255571365, + "num_tokens": 439510918.0, + "step": 1526 + }, + { + "epoch": 0.543900267141585, + "grad_norm": 0.7114976644515991, + "learning_rate": 1e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7522852122783661, + "num_tokens": 439808903.0, + "step": 1527 + }, + { + "epoch": 0.5442564559216385, + "grad_norm": 0.7961774468421936, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7495100945234299, + "num_tokens": 440043736.0, + "step": 1528 + }, + { + "epoch": 0.5446126447016919, + "grad_norm": 0.6946681141853333, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7701802551746368, + "num_tokens": 440311638.0, + "step": 1529 + }, + { + "epoch": 0.5449688334817453, + "grad_norm": 0.7238008379936218, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7533843964338303, + "num_tokens": 440571625.0, + "step": 1530 + }, + { + "epoch": 0.5453250222617988, + "grad_norm": 0.7027267217636108, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7418109476566315, + "num_tokens": 440887813.0, + "step": 1531 + }, + { + "epoch": 0.5456812110418522, + "grad_norm": 
0.7119235992431641, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7362840920686722, + "num_tokens": 441158368.0, + "step": 1532 + }, + { + "epoch": 0.5460373998219056, + "grad_norm": 0.6757463216781616, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7600829899311066, + "num_tokens": 441439003.0, + "step": 1533 + }, + { + "epoch": 0.5463935886019591, + "grad_norm": 0.708584725856781, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.769260972738266, + "num_tokens": 441727161.0, + "step": 1534 + }, + { + "epoch": 0.5467497773820125, + "grad_norm": 0.6806308031082153, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7705401927232742, + "num_tokens": 442032926.0, + "step": 1535 + }, + { + "epoch": 0.5471059661620659, + "grad_norm": 0.6866256594657898, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7512576133012772, + "num_tokens": 442321756.0, + "step": 1536 + }, + { + "epoch": 0.5474621549421194, + "grad_norm": 0.7141119241714478, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7414729744195938, + "num_tokens": 442579183.0, + "step": 1537 + }, + { + "epoch": 0.5478183437221728, + "grad_norm": 0.715511679649353, + "learning_rate": 1e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7558271586894989, + "num_tokens": 442850965.0, + "step": 1538 + }, + { + "epoch": 0.5481745325022261, + "grad_norm": 0.688752293586731, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7577531486749649, + "num_tokens": 443144673.0, + "step": 1539 + }, + { + "epoch": 0.5485307212822796, + "grad_norm": 0.7914301753044128, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7425520122051239, + "num_tokens": 443427695.0, + "step": 1540 + }, + { + "epoch": 0.548886910062333, + "grad_norm": 0.7411672472953796, + "learning_rate": 1e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7483720928430557, + "num_tokens": 
443695238.0, + "step": 1541 + }, + { + "epoch": 0.5492430988423864, + "grad_norm": 0.6937350034713745, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7520532011985779, + "num_tokens": 443968030.0, + "step": 1542 + }, + { + "epoch": 0.5495992876224399, + "grad_norm": 0.7095234990119934, + "learning_rate": 1e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7479874640703201, + "num_tokens": 444241499.0, + "step": 1543 + }, + { + "epoch": 0.5499554764024933, + "grad_norm": 0.7266533970832825, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7415417432785034, + "num_tokens": 444489202.0, + "step": 1544 + }, + { + "epoch": 0.5503116651825467, + "grad_norm": 0.7626387476921082, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7462373077869415, + "num_tokens": 444721564.0, + "step": 1545 + }, + { + "epoch": 0.5506678539626002, + "grad_norm": 0.7408244609832764, + "learning_rate": 1e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7534405589103699, + "num_tokens": 444978912.0, + "step": 1546 + }, + { + "epoch": 0.5510240427426536, + "grad_norm": 0.7104193568229675, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7426908612251282, + "num_tokens": 445250877.0, + "step": 1547 + }, + { + "epoch": 0.551380231522707, + "grad_norm": 0.6660391688346863, + "learning_rate": 1e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7559630274772644, + "num_tokens": 445530811.0, + "step": 1548 + }, + { + "epoch": 0.5517364203027605, + "grad_norm": 0.6548779606819153, + "learning_rate": 1e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7574136108160019, + "num_tokens": 445823842.0, + "step": 1549 + }, + { + "epoch": 0.5520926090828139, + "grad_norm": 0.6861368417739868, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7501563429832458, + "num_tokens": 446099578.0, + "step": 1550 + }, + { + "epoch": 0.5524487978628673, + "grad_norm": 0.7220462560653687, + "learning_rate": 1e-06, 
+ "loss": 0.7877, + "mean_token_accuracy": 0.753923773765564, + "num_tokens": 446381031.0, + "step": 1551 + }, + { + "epoch": 0.5528049866429208, + "grad_norm": 0.7022643685340881, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7604591846466064, + "num_tokens": 446638142.0, + "step": 1552 + }, + { + "epoch": 0.5531611754229742, + "grad_norm": 0.7013078927993774, + "learning_rate": 1e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7754212021827698, + "num_tokens": 446943608.0, + "step": 1553 + }, + { + "epoch": 0.5535173642030276, + "grad_norm": 0.6835857629776001, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.769231766462326, + "num_tokens": 447271986.0, + "step": 1554 + }, + { + "epoch": 0.553873552983081, + "grad_norm": 0.6570292711257935, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7498809546232224, + "num_tokens": 447590683.0, + "step": 1555 + }, + { + "epoch": 0.5542297417631344, + "grad_norm": 0.7039280533790588, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7524091899394989, + "num_tokens": 447864025.0, + "step": 1556 + }, + { + "epoch": 0.5545859305431879, + "grad_norm": 0.6801185607910156, + "learning_rate": 1e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7550346106290817, + "num_tokens": 448186567.0, + "step": 1557 + }, + { + "epoch": 0.5549421193232413, + "grad_norm": 0.7260575294494629, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7507253289222717, + "num_tokens": 448461249.0, + "step": 1558 + }, + { + "epoch": 0.5552983081032947, + "grad_norm": 0.682009756565094, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7480096369981766, + "num_tokens": 448763857.0, + "step": 1559 + }, + { + "epoch": 0.5556544968833482, + "grad_norm": 0.7054060101509094, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7443902641534805, + "num_tokens": 449037397.0, + "step": 1560 + }, + { + "epoch": 
0.5560106856634016, + "grad_norm": 0.620359480381012, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7689460217952728, + "num_tokens": 449352791.0, + "step": 1561 + }, + { + "epoch": 0.556366874443455, + "grad_norm": 0.7295332551002502, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7495351284742355, + "num_tokens": 449662957.0, + "step": 1562 + }, + { + "epoch": 0.5567230632235085, + "grad_norm": 0.7037706971168518, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7552017420530319, + "num_tokens": 449937753.0, + "step": 1563 + }, + { + "epoch": 0.5570792520035619, + "grad_norm": 0.6755703687667847, + "learning_rate": 1e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7677937299013138, + "num_tokens": 450253587.0, + "step": 1564 + }, + { + "epoch": 0.5574354407836153, + "grad_norm": 0.7302614450454712, + "learning_rate": 1e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7579736709594727, + "num_tokens": 450548374.0, + "step": 1565 + }, + { + "epoch": 0.5577916295636688, + "grad_norm": 0.6700519919395447, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7498626112937927, + "num_tokens": 450856395.0, + "step": 1566 + }, + { + "epoch": 0.5581478183437222, + "grad_norm": 0.6933807134628296, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7291596829891205, + "num_tokens": 451141164.0, + "step": 1567 + }, + { + "epoch": 0.5585040071237756, + "grad_norm": 0.6258497834205627, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7596900761127472, + "num_tokens": 451450176.0, + "step": 1568 + }, + { + "epoch": 0.5588601959038291, + "grad_norm": 0.7062546610832214, + "learning_rate": 1e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7761981636285782, + "num_tokens": 451725957.0, + "step": 1569 + }, + { + "epoch": 0.5592163846838825, + "grad_norm": 0.7490414381027222, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 
0.7571186423301697, + "num_tokens": 451994411.0, + "step": 1570 + }, + { + "epoch": 0.5595725734639359, + "grad_norm": 0.6906751990318298, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.758009597659111, + "num_tokens": 452299364.0, + "step": 1571 + }, + { + "epoch": 0.5599287622439894, + "grad_norm": 0.6652560830116272, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7603357285261154, + "num_tokens": 452580709.0, + "step": 1572 + }, + { + "epoch": 0.5602849510240427, + "grad_norm": 0.7313030362129211, + "learning_rate": 1e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7464917898178101, + "num_tokens": 452841933.0, + "step": 1573 + }, + { + "epoch": 0.5606411398040961, + "grad_norm": 0.7480945587158203, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7663114964962006, + "num_tokens": 453073993.0, + "step": 1574 + }, + { + "epoch": 0.5609973285841496, + "grad_norm": 0.6909177303314209, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7468160837888718, + "num_tokens": 453334208.0, + "step": 1575 + }, + { + "epoch": 0.561353517364203, + "grad_norm": 0.7359760999679565, + "learning_rate": 1e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7458634823560715, + "num_tokens": 453588486.0, + "step": 1576 + }, + { + "epoch": 0.5617097061442564, + "grad_norm": 0.6783101558685303, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7652082145214081, + "num_tokens": 453885641.0, + "step": 1577 + }, + { + "epoch": 0.5620658949243099, + "grad_norm": 0.7044235467910767, + "learning_rate": 1e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7609682679176331, + "num_tokens": 454155106.0, + "step": 1578 + }, + { + "epoch": 0.5624220837043633, + "grad_norm": 0.7019275426864624, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7586607486009598, + "num_tokens": 454431092.0, + "step": 1579 + }, + { + "epoch": 0.5627782724844167, + "grad_norm": 
0.6587383151054382, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7528807669878006, + "num_tokens": 454729612.0, + "step": 1580 + }, + { + "epoch": 0.5631344612644702, + "grad_norm": 0.6545379161834717, + "learning_rate": 1e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.7587550431489944, + "num_tokens": 455027132.0, + "step": 1581 + }, + { + "epoch": 0.5634906500445236, + "grad_norm": 0.6430779099464417, + "learning_rate": 1e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7631281614303589, + "num_tokens": 455330952.0, + "step": 1582 + }, + { + "epoch": 0.563846838824577, + "grad_norm": 0.6687248349189758, + "learning_rate": 1e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7759902775287628, + "num_tokens": 455609827.0, + "step": 1583 + }, + { + "epoch": 0.5642030276046305, + "grad_norm": 0.7477811574935913, + "learning_rate": 1e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.754266157746315, + "num_tokens": 455867699.0, + "step": 1584 + }, + { + "epoch": 0.5645592163846839, + "grad_norm": 0.6729284524917603, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7684850543737411, + "num_tokens": 456181392.0, + "step": 1585 + }, + { + "epoch": 0.5649154051647373, + "grad_norm": 0.6704213619232178, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7586458474397659, + "num_tokens": 456462572.0, + "step": 1586 + }, + { + "epoch": 0.5652715939447908, + "grad_norm": 0.6432669758796692, + "learning_rate": 1e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7754636406898499, + "num_tokens": 456767257.0, + "step": 1587 + }, + { + "epoch": 0.5656277827248442, + "grad_norm": 0.6851404905319214, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7601483166217804, + "num_tokens": 457058633.0, + "step": 1588 + }, + { + "epoch": 0.5659839715048975, + "grad_norm": 0.712519645690918, + "learning_rate": 1e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7569452673196793, + "num_tokens": 
457342973.0, + "step": 1589 + }, + { + "epoch": 0.566340160284951, + "grad_norm": 0.653653621673584, + "learning_rate": 1e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7532462924718857, + "num_tokens": 457625184.0, + "step": 1590 + }, + { + "epoch": 0.5666963490650044, + "grad_norm": 0.6737965941429138, + "learning_rate": 1e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.7487844675779343, + "num_tokens": 457915214.0, + "step": 1591 + }, + { + "epoch": 0.5670525378450579, + "grad_norm": 0.6558749079704285, + "learning_rate": 1e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7436233311891556, + "num_tokens": 458184507.0, + "step": 1592 + }, + { + "epoch": 0.5674087266251113, + "grad_norm": 0.7121096849441528, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.74049311876297, + "num_tokens": 458458168.0, + "step": 1593 + }, + { + "epoch": 0.5677649154051647, + "grad_norm": 0.7192794680595398, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7522018104791641, + "num_tokens": 458742095.0, + "step": 1594 + }, + { + "epoch": 0.5681211041852182, + "grad_norm": 0.7074292898178101, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7585836797952652, + "num_tokens": 459015420.0, + "step": 1595 + }, + { + "epoch": 0.5684772929652716, + "grad_norm": 0.7035124897956848, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7425711005926132, + "num_tokens": 459283235.0, + "step": 1596 + }, + { + "epoch": 0.568833481745325, + "grad_norm": 0.7321388125419617, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7417465448379517, + "num_tokens": 459556149.0, + "step": 1597 + }, + { + "epoch": 0.5691896705253785, + "grad_norm": 0.7197571992874146, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7604738771915436, + "num_tokens": 459824037.0, + "step": 1598 + }, + { + "epoch": 0.5695458593054319, + "grad_norm": 0.7267109155654907, + "learning_rate": 1e-06, + 
"loss": 0.7294, + "mean_token_accuracy": 0.766941100358963, + "num_tokens": 460075261.0, + "step": 1599 + }, + { + "epoch": 0.5699020480854853, + "grad_norm": 0.7570378184318542, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7559410631656647, + "num_tokens": 460325346.0, + "step": 1600 + }, + { + "epoch": 0.5702582368655388, + "grad_norm": 0.7155254483222961, + "learning_rate": 1e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7528364658355713, + "num_tokens": 460615013.0, + "step": 1601 + }, + { + "epoch": 0.5706144256455922, + "grad_norm": 0.6592368483543396, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.750335082411766, + "num_tokens": 460900700.0, + "step": 1602 + }, + { + "epoch": 0.5709706144256456, + "grad_norm": 0.7447939515113831, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7462705671787262, + "num_tokens": 461149503.0, + "step": 1603 + }, + { + "epoch": 0.5713268032056991, + "grad_norm": 0.6899593472480774, + "learning_rate": 1e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.7686243206262589, + "num_tokens": 461446111.0, + "step": 1604 + }, + { + "epoch": 0.5716829919857525, + "grad_norm": 0.6921606659889221, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7482394725084305, + "num_tokens": 461731231.0, + "step": 1605 + }, + { + "epoch": 0.5720391807658058, + "grad_norm": 0.6909536719322205, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7489177882671356, + "num_tokens": 461996589.0, + "step": 1606 + }, + { + "epoch": 0.5723953695458593, + "grad_norm": 0.6582622528076172, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7740426957607269, + "num_tokens": 462299961.0, + "step": 1607 + }, + { + "epoch": 0.5727515583259127, + "grad_norm": 0.7023988366127014, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7699137479066849, + "num_tokens": 462571837.0, + "step": 1608 + }, + { + "epoch": 
0.5731077471059661, + "grad_norm": 0.6839088201522827, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.745259627699852, + "num_tokens": 462857115.0, + "step": 1609 + }, + { + "epoch": 0.5734639358860196, + "grad_norm": 0.6940921545028687, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7480878680944443, + "num_tokens": 463128013.0, + "step": 1610 + }, + { + "epoch": 0.573820124666073, + "grad_norm": 0.7176468968391418, + "learning_rate": 1e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7533196657896042, + "num_tokens": 463398746.0, + "step": 1611 + }, + { + "epoch": 0.5741763134461264, + "grad_norm": 0.6807556748390198, + "learning_rate": 1e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7704791128635406, + "num_tokens": 463686984.0, + "step": 1612 + }, + { + "epoch": 0.5745325022261799, + "grad_norm": 0.7118833065032959, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7454826533794403, + "num_tokens": 463966763.0, + "step": 1613 + }, + { + "epoch": 0.5748886910062333, + "grad_norm": 0.6613655686378479, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7593540549278259, + "num_tokens": 464270177.0, + "step": 1614 + }, + { + "epoch": 0.5752448797862867, + "grad_norm": 0.7149350643157959, + "learning_rate": 1e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7600640058517456, + "num_tokens": 464553805.0, + "step": 1615 + }, + { + "epoch": 0.5756010685663402, + "grad_norm": 0.6840316653251648, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7583485394716263, + "num_tokens": 464863425.0, + "step": 1616 + }, + { + "epoch": 0.5759572573463936, + "grad_norm": 0.6407206058502197, + "learning_rate": 1e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.786345511674881, + "num_tokens": 465168324.0, + "step": 1617 + }, + { + "epoch": 0.576313446126447, + "grad_norm": 0.6370583176612854, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 
0.7694699913263321, + "num_tokens": 465475759.0, + "step": 1618 + }, + { + "epoch": 0.5766696349065005, + "grad_norm": 0.6975677013397217, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7565760761499405, + "num_tokens": 465779638.0, + "step": 1619 + }, + { + "epoch": 0.5770258236865539, + "grad_norm": 0.6869992613792419, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7381137609481812, + "num_tokens": 466079644.0, + "step": 1620 + }, + { + "epoch": 0.5773820124666073, + "grad_norm": 0.7326099872589111, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7412468791007996, + "num_tokens": 466354639.0, + "step": 1621 + }, + { + "epoch": 0.5777382012466608, + "grad_norm": 0.7066259980201721, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7647463530302048, + "num_tokens": 466630661.0, + "step": 1622 + }, + { + "epoch": 0.5780943900267141, + "grad_norm": 0.7102968692779541, + "learning_rate": 1e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7543385326862335, + "num_tokens": 466892597.0, + "step": 1623 + }, + { + "epoch": 0.5784505788067675, + "grad_norm": 0.6883658766746521, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7315827459096909, + "num_tokens": 467195911.0, + "step": 1624 + }, + { + "epoch": 0.578806767586821, + "grad_norm": 0.7556871175765991, + "learning_rate": 1e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7473396956920624, + "num_tokens": 467490967.0, + "step": 1625 + }, + { + "epoch": 0.5791629563668744, + "grad_norm": 0.6644706130027771, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.761175662279129, + "num_tokens": 467775130.0, + "step": 1626 + }, + { + "epoch": 0.5795191451469278, + "grad_norm": 1.5083198547363281, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7602347582578659, + "num_tokens": 468033850.0, + "step": 1627 + }, + { + "epoch": 0.5798753339269813, + "grad_norm": 
0.7405851483345032, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7477159649133682, + "num_tokens": 468305314.0, + "step": 1628 + }, + { + "epoch": 0.5802315227070347, + "grad_norm": 0.6960292458534241, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7721590399742126, + "num_tokens": 468576762.0, + "step": 1629 + }, + { + "epoch": 0.5805877114870882, + "grad_norm": 0.7263566255569458, + "learning_rate": 1e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7581649422645569, + "num_tokens": 468832616.0, + "step": 1630 + }, + { + "epoch": 0.5809439002671416, + "grad_norm": 0.6371350884437561, + "learning_rate": 1e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7788739949464798, + "num_tokens": 469130341.0, + "step": 1631 + }, + { + "epoch": 0.581300089047195, + "grad_norm": 0.6608961224555969, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7556357234716415, + "num_tokens": 469431631.0, + "step": 1632 + }, + { + "epoch": 0.5816562778272485, + "grad_norm": 0.7173916101455688, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7384002357721329, + "num_tokens": 469707810.0, + "step": 1633 + }, + { + "epoch": 0.5820124666073019, + "grad_norm": 0.7125408053398132, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.758104681968689, + "num_tokens": 469972829.0, + "step": 1634 + }, + { + "epoch": 0.5823686553873553, + "grad_norm": 0.693537712097168, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7613609731197357, + "num_tokens": 470261487.0, + "step": 1635 + }, + { + "epoch": 0.5827248441674088, + "grad_norm": 0.6727031469345093, + "learning_rate": 1e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7723237127065659, + "num_tokens": 470542855.0, + "step": 1636 + }, + { + "epoch": 0.5830810329474622, + "grad_norm": 0.6788088083267212, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7584013491868973, + "num_tokens": 
470826172.0, + "step": 1637 + }, + { + "epoch": 0.5834372217275156, + "grad_norm": 0.6692792177200317, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7498494982719421, + "num_tokens": 471099593.0, + "step": 1638 + }, + { + "epoch": 0.583793410507569, + "grad_norm": 0.6996212601661682, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7694244682788849, + "num_tokens": 471390673.0, + "step": 1639 + }, + { + "epoch": 0.5841495992876224, + "grad_norm": 0.6895626187324524, + "learning_rate": 1e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7537928521633148, + "num_tokens": 471670412.0, + "step": 1640 + }, + { + "epoch": 0.5845057880676758, + "grad_norm": 0.676317036151886, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.755195826292038, + "num_tokens": 471949126.0, + "step": 1641 + }, + { + "epoch": 0.5848619768477293, + "grad_norm": 0.6535193920135498, + "learning_rate": 1e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7490397095680237, + "num_tokens": 472248679.0, + "step": 1642 + }, + { + "epoch": 0.5852181656277827, + "grad_norm": 0.6575945615768433, + "learning_rate": 1e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7683083862066269, + "num_tokens": 472535039.0, + "step": 1643 + }, + { + "epoch": 0.5855743544078361, + "grad_norm": 0.7003069519996643, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7613403797149658, + "num_tokens": 472831856.0, + "step": 1644 + }, + { + "epoch": 0.5859305431878896, + "grad_norm": 0.6821804046630859, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7364382743835449, + "num_tokens": 473131057.0, + "step": 1645 + }, + { + "epoch": 0.586286731967943, + "grad_norm": 0.711052656173706, + "learning_rate": 1e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7532916069030762, + "num_tokens": 473411961.0, + "step": 1646 + }, + { + "epoch": 0.5866429207479964, + "grad_norm": 0.7374253869056702, + "learning_rate": 1e-06, + 
"loss": 0.7515, + "mean_token_accuracy": 0.7638296186923981, + "num_tokens": 473703093.0, + "step": 1647 + }, + { + "epoch": 0.5869991095280499, + "grad_norm": 0.6740974187850952, + "learning_rate": 1e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7549728453159332, + "num_tokens": 474015249.0, + "step": 1648 + }, + { + "epoch": 0.5873552983081033, + "grad_norm": 0.672715961933136, + "learning_rate": 1e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7645120322704315, + "num_tokens": 474302825.0, + "step": 1649 + }, + { + "epoch": 0.5877114870881567, + "grad_norm": 0.6929161548614502, + "learning_rate": 1e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7671017050743103, + "num_tokens": 474577515.0, + "step": 1650 + }, + { + "epoch": 0.5880676758682102, + "grad_norm": 0.7047606706619263, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7539904564619064, + "num_tokens": 474867085.0, + "step": 1651 + }, + { + "epoch": 0.5884238646482636, + "grad_norm": 0.6950231790542603, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7496300041675568, + "num_tokens": 475167857.0, + "step": 1652 + }, + { + "epoch": 0.588780053428317, + "grad_norm": 0.7122836709022522, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7425461262464523, + "num_tokens": 475457095.0, + "step": 1653 + }, + { + "epoch": 0.5891362422083705, + "grad_norm": 0.6594682931900024, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7577188909053802, + "num_tokens": 475767024.0, + "step": 1654 + }, + { + "epoch": 0.5894924309884239, + "grad_norm": 0.6666103601455688, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7545740008354187, + "num_tokens": 476070502.0, + "step": 1655 + }, + { + "epoch": 0.5898486197684772, + "grad_norm": 0.6813030242919922, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7378323823213577, + "num_tokens": 476357367.0, + "step": 1656 + }, + { + "epoch": 
0.5902048085485307, + "grad_norm": 0.65237957239151, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7620687633752823, + "num_tokens": 476658609.0, + "step": 1657 + }, + { + "epoch": 0.5905609973285841, + "grad_norm": 0.6712964773178101, + "learning_rate": 1e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7666117697954178, + "num_tokens": 476959863.0, + "step": 1658 + }, + { + "epoch": 0.5909171861086375, + "grad_norm": 0.7354835271835327, + "learning_rate": 1e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.746214359998703, + "num_tokens": 477253460.0, + "step": 1659 + }, + { + "epoch": 0.591273374888691, + "grad_norm": 0.7623605132102966, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7515738308429718, + "num_tokens": 477491294.0, + "step": 1660 + }, + { + "epoch": 0.5916295636687444, + "grad_norm": 0.7649231553077698, + "learning_rate": 1e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7476779669523239, + "num_tokens": 477731693.0, + "step": 1661 + }, + { + "epoch": 0.5919857524487978, + "grad_norm": 0.6439259648323059, + "learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7615954279899597, + "num_tokens": 478067867.0, + "step": 1662 + }, + { + "epoch": 0.5923419412288513, + "grad_norm": 0.6878642439842224, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.745199590921402, + "num_tokens": 478372063.0, + "step": 1663 + }, + { + "epoch": 0.5926981300089047, + "grad_norm": 0.697557806968689, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7384065836668015, + "num_tokens": 478694678.0, + "step": 1664 + }, + { + "epoch": 0.5930543187889582, + "grad_norm": 0.7068673968315125, + "learning_rate": 1e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.75042524933815, + "num_tokens": 478969447.0, + "step": 1665 + }, + { + "epoch": 0.5934105075690116, + "grad_norm": 0.7215568423271179, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 
0.7308874130249023, + "num_tokens": 479219571.0, + "step": 1666 + }, + { + "epoch": 0.593766696349065, + "grad_norm": 0.697297990322113, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7467194497585297, + "num_tokens": 479491534.0, + "step": 1667 + }, + { + "epoch": 0.5941228851291185, + "grad_norm": 0.6736904382705688, + "learning_rate": 1e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7621931731700897, + "num_tokens": 479787189.0, + "step": 1668 + }, + { + "epoch": 0.5944790739091719, + "grad_norm": 0.6767702698707581, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7611201256513596, + "num_tokens": 480103374.0, + "step": 1669 + }, + { + "epoch": 0.5948352626892253, + "grad_norm": 0.6861124634742737, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7596408575773239, + "num_tokens": 480372543.0, + "step": 1670 + }, + { + "epoch": 0.5951914514692788, + "grad_norm": 0.7462039589881897, + "learning_rate": 1e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7424971908330917, + "num_tokens": 480635318.0, + "step": 1671 + }, + { + "epoch": 0.5955476402493322, + "grad_norm": 0.680420994758606, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7642297893762589, + "num_tokens": 480920997.0, + "step": 1672 + }, + { + "epoch": 0.5959038290293855, + "grad_norm": 0.6909247636795044, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7474656701087952, + "num_tokens": 481194250.0, + "step": 1673 + }, + { + "epoch": 0.596260017809439, + "grad_norm": 0.6689778566360474, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7486626356840134, + "num_tokens": 481483281.0, + "step": 1674 + }, + { + "epoch": 0.5966162065894924, + "grad_norm": 0.6747426986694336, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7605278193950653, + "num_tokens": 481780565.0, + "step": 1675 + }, + { + "epoch": 0.5969723953695458, + "grad_norm": 
0.7066935300827026, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7570203542709351, + "num_tokens": 482048150.0, + "step": 1676 + }, + { + "epoch": 0.5973285841495993, + "grad_norm": 0.6710976958274841, + "learning_rate": 1e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7683971077203751, + "num_tokens": 482340623.0, + "step": 1677 + }, + { + "epoch": 0.5976847729296527, + "grad_norm": 0.6826766133308411, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7707574367523193, + "num_tokens": 482642870.0, + "step": 1678 + }, + { + "epoch": 0.5980409617097061, + "grad_norm": 0.7053518891334534, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7546874284744263, + "num_tokens": 482913412.0, + "step": 1679 + }, + { + "epoch": 0.5983971504897596, + "grad_norm": 0.7164046168327332, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7625271081924438, + "num_tokens": 483175039.0, + "step": 1680 + }, + { + "epoch": 0.598753339269813, + "grad_norm": 0.7122494578361511, + "learning_rate": 1e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7613331377506256, + "num_tokens": 483426010.0, + "step": 1681 + }, + { + "epoch": 0.5991095280498664, + "grad_norm": 0.7134946584701538, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7706554979085922, + "num_tokens": 483725549.0, + "step": 1682 + }, + { + "epoch": 0.5994657168299199, + "grad_norm": 0.7043372392654419, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7518906146287918, + "num_tokens": 483993384.0, + "step": 1683 + }, + { + "epoch": 0.5998219056099733, + "grad_norm": 0.6929348111152649, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7622651606798172, + "num_tokens": 484275857.0, + "step": 1684 + }, + { + "epoch": 0.6001780943900267, + "grad_norm": 0.7072084546089172, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7543046176433563, + "num_tokens": 
484552405.0, + "step": 1685 + }, + { + "epoch": 0.6005342831700802, + "grad_norm": 0.6861827373504639, + "learning_rate": 1e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7792967855930328, + "num_tokens": 484841963.0, + "step": 1686 + }, + { + "epoch": 0.6008904719501336, + "grad_norm": 0.6515093445777893, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7545286118984222, + "num_tokens": 485162800.0, + "step": 1687 + }, + { + "epoch": 0.601246660730187, + "grad_norm": 0.6896921396255493, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7618377506732941, + "num_tokens": 485439456.0, + "step": 1688 + }, + { + "epoch": 0.6016028495102405, + "grad_norm": 0.6405121684074402, + "learning_rate": 1e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.756773516535759, + "num_tokens": 485763434.0, + "step": 1689 + }, + { + "epoch": 0.6019590382902938, + "grad_norm": 0.687899649143219, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7596358358860016, + "num_tokens": 486047834.0, + "step": 1690 + }, + { + "epoch": 0.6023152270703472, + "grad_norm": 0.717074990272522, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7641862779855728, + "num_tokens": 486299877.0, + "step": 1691 + }, + { + "epoch": 0.6026714158504007, + "grad_norm": 0.7384796142578125, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7621385902166367, + "num_tokens": 486561394.0, + "step": 1692 + }, + { + "epoch": 0.6030276046304541, + "grad_norm": 0.7053343653678894, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7626385688781738, + "num_tokens": 486878337.0, + "step": 1693 + }, + { + "epoch": 0.6033837934105075, + "grad_norm": 0.6918609738349915, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7392792999744415, + "num_tokens": 487170967.0, + "step": 1694 + }, + { + "epoch": 0.603739982190561, + "grad_norm": 0.6899123191833496, + "learning_rate": 1e-06, + 
"loss": 0.7636, + "mean_token_accuracy": 0.7583145350217819, + "num_tokens": 487462812.0, + "step": 1695 + }, + { + "epoch": 0.6040961709706144, + "grad_norm": 0.6955690979957581, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7677481472492218, + "num_tokens": 487762908.0, + "step": 1696 + }, + { + "epoch": 0.6044523597506678, + "grad_norm": 0.7509586811065674, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.764551430940628, + "num_tokens": 488023015.0, + "step": 1697 + }, + { + "epoch": 0.6048085485307213, + "grad_norm": 0.6617805361747742, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7632321864366531, + "num_tokens": 488327266.0, + "step": 1698 + }, + { + "epoch": 0.6051647373107747, + "grad_norm": 0.7314428687095642, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7534418404102325, + "num_tokens": 488600666.0, + "step": 1699 + }, + { + "epoch": 0.6055209260908282, + "grad_norm": 0.7185302972793579, + "learning_rate": 1e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.7874864488840103, + "num_tokens": 488895487.0, + "step": 1700 + }, + { + "epoch": 0.6058771148708816, + "grad_norm": 0.6969638466835022, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7619389593601227, + "num_tokens": 489181257.0, + "step": 1701 + }, + { + "epoch": 0.606233303650935, + "grad_norm": 0.6907720565795898, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7570504695177078, + "num_tokens": 489485575.0, + "step": 1702 + }, + { + "epoch": 0.6065894924309885, + "grad_norm": 0.6971386671066284, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7582043558359146, + "num_tokens": 489768839.0, + "step": 1703 + }, + { + "epoch": 0.6069456812110419, + "grad_norm": 0.6910429000854492, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7433193773031235, + "num_tokens": 490060044.0, + "step": 1704 + }, + { + "epoch": 
0.6073018699910953, + "grad_norm": 0.6693165302276611, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.762890413403511, + "num_tokens": 490336851.0, + "step": 1705 + }, + { + "epoch": 0.6076580587711488, + "grad_norm": 0.7091513872146606, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7238338440656662, + "num_tokens": 490613010.0, + "step": 1706 + }, + { + "epoch": 0.6080142475512021, + "grad_norm": 0.7035695314407349, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.75307796895504, + "num_tokens": 490913726.0, + "step": 1707 + }, + { + "epoch": 0.6083704363312555, + "grad_norm": 0.685778021812439, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7468729019165039, + "num_tokens": 491196189.0, + "step": 1708 + }, + { + "epoch": 0.608726625111309, + "grad_norm": 0.6714504361152649, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7489855289459229, + "num_tokens": 491488860.0, + "step": 1709 + }, + { + "epoch": 0.6090828138913624, + "grad_norm": 0.6968784928321838, + "learning_rate": 1e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7779334038496017, + "num_tokens": 491773106.0, + "step": 1710 + }, + { + "epoch": 0.6094390026714158, + "grad_norm": 0.6878619194030762, + "learning_rate": 1e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7572395205497742, + "num_tokens": 492079566.0, + "step": 1711 + }, + { + "epoch": 0.6097951914514693, + "grad_norm": 0.7511493563652039, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.75351881980896, + "num_tokens": 492356821.0, + "step": 1712 + }, + { + "epoch": 0.6101513802315227, + "grad_norm": 0.7278934717178345, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.74567511677742, + "num_tokens": 492641214.0, + "step": 1713 + }, + { + "epoch": 0.6105075690115761, + "grad_norm": 0.6828153729438782, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 
0.745887503027916, + "num_tokens": 492944049.0, + "step": 1714 + }, + { + "epoch": 0.6108637577916296, + "grad_norm": 0.6501777768135071, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7698768526315689, + "num_tokens": 493259274.0, + "step": 1715 + }, + { + "epoch": 0.611219946571683, + "grad_norm": 0.7574165463447571, + "learning_rate": 1e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.758803129196167, + "num_tokens": 493521356.0, + "step": 1716 + }, + { + "epoch": 0.6115761353517364, + "grad_norm": 0.6963102221488953, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7516181021928787, + "num_tokens": 493822046.0, + "step": 1717 + }, + { + "epoch": 0.6119323241317899, + "grad_norm": 0.6928281188011169, + "learning_rate": 1e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7824086993932724, + "num_tokens": 494116198.0, + "step": 1718 + }, + { + "epoch": 0.6122885129118433, + "grad_norm": 0.6521168351173401, + "learning_rate": 1e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7638057172298431, + "num_tokens": 494438049.0, + "step": 1719 + }, + { + "epoch": 0.6126447016918967, + "grad_norm": 0.6655930280685425, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7546397298574448, + "num_tokens": 494729298.0, + "step": 1720 + }, + { + "epoch": 0.6130008904719502, + "grad_norm": 0.7064603567123413, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7718293070793152, + "num_tokens": 495024930.0, + "step": 1721 + }, + { + "epoch": 0.6133570792520036, + "grad_norm": 0.7195597290992737, + "learning_rate": 1e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7562415301799774, + "num_tokens": 495315521.0, + "step": 1722 + }, + { + "epoch": 0.613713268032057, + "grad_norm": 0.646294355392456, + "learning_rate": 1e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7613352537155151, + "num_tokens": 495601639.0, + "step": 1723 + }, + { + "epoch": 0.6140694568121104, + "grad_norm": 
0.7736311554908752, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7427951544523239, + "num_tokens": 495854172.0, + "step": 1724 + }, + { + "epoch": 0.6144256455921638, + "grad_norm": 0.7612985372543335, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.754918247461319, + "num_tokens": 496119463.0, + "step": 1725 + }, + { + "epoch": 0.6147818343722172, + "grad_norm": 0.6645041704177856, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7584660053253174, + "num_tokens": 496417960.0, + "step": 1726 + }, + { + "epoch": 0.6151380231522707, + "grad_norm": 0.6588650941848755, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7604922503232956, + "num_tokens": 496731507.0, + "step": 1727 + }, + { + "epoch": 0.6154942119323241, + "grad_norm": 0.7208360433578491, + "learning_rate": 1e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7483381032943726, + "num_tokens": 496987715.0, + "step": 1728 + }, + { + "epoch": 0.6158504007123775, + "grad_norm": 0.6745775938034058, + "learning_rate": 1e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7512947469949722, + "num_tokens": 497266143.0, + "step": 1729 + }, + { + "epoch": 0.616206589492431, + "grad_norm": 0.6963668465614319, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7636212557554245, + "num_tokens": 497534631.0, + "step": 1730 + }, + { + "epoch": 0.6165627782724844, + "grad_norm": 0.7256969213485718, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7424382865428925, + "num_tokens": 497806894.0, + "step": 1731 + }, + { + "epoch": 0.6169189670525378, + "grad_norm": 0.6418361663818359, + "learning_rate": 1e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7621777653694153, + "num_tokens": 498105005.0, + "step": 1732 + }, + { + "epoch": 0.6172751558325913, + "grad_norm": 0.6963315010070801, + "learning_rate": 1e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7472109943628311, + "num_tokens": 
498384347.0, + "step": 1733 + }, + { + "epoch": 0.6176313446126447, + "grad_norm": 0.6821424961090088, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7462018877267838, + "num_tokens": 498665692.0, + "step": 1734 + }, + { + "epoch": 0.6179875333926982, + "grad_norm": 0.6718099117279053, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7538799196481705, + "num_tokens": 498956080.0, + "step": 1735 + }, + { + "epoch": 0.6183437221727516, + "grad_norm": 0.7174425721168518, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7551645636558533, + "num_tokens": 499217302.0, + "step": 1736 + }, + { + "epoch": 0.618699910952805, + "grad_norm": 0.6808143854141235, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7603513300418854, + "num_tokens": 499503172.0, + "step": 1737 + }, + { + "epoch": 0.6190560997328585, + "grad_norm": 0.6580443382263184, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.76399165391922, + "num_tokens": 499818501.0, + "step": 1738 + }, + { + "epoch": 0.6194122885129119, + "grad_norm": 0.688873827457428, + "learning_rate": 1e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7496615648269653, + "num_tokens": 500088672.0, + "step": 1739 + }, + { + "epoch": 0.6197684772929652, + "grad_norm": 0.6640873551368713, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7612507492303848, + "num_tokens": 500367884.0, + "step": 1740 + }, + { + "epoch": 0.6201246660730187, + "grad_norm": 0.6584781408309937, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7583357840776443, + "num_tokens": 500671467.0, + "step": 1741 + }, + { + "epoch": 0.6204808548530721, + "grad_norm": 0.7282753586769104, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7508539110422134, + "num_tokens": 500930856.0, + "step": 1742 + }, + { + "epoch": 0.6208370436331255, + "grad_norm": 0.6781667470932007, + "learning_rate": 1e-06, + 
"loss": 0.7881, + "mean_token_accuracy": 0.7521840333938599, + "num_tokens": 501216249.0, + "step": 1743 + }, + { + "epoch": 0.621193232413179, + "grad_norm": 0.6811519265174866, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7637189030647278, + "num_tokens": 501500464.0, + "step": 1744 + }, + { + "epoch": 0.6215494211932324, + "grad_norm": 0.7107496857643127, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7593941390514374, + "num_tokens": 501777363.0, + "step": 1745 + }, + { + "epoch": 0.6219056099732858, + "grad_norm": 0.6206598281860352, + "learning_rate": 1e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7638285458087921, + "num_tokens": 502098552.0, + "step": 1746 + }, + { + "epoch": 0.6222617987533393, + "grad_norm": 0.6435748934745789, + "learning_rate": 1e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7513782680034637, + "num_tokens": 502402104.0, + "step": 1747 + }, + { + "epoch": 0.6226179875333927, + "grad_norm": 0.6864527463912964, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7661651074886322, + "num_tokens": 502674050.0, + "step": 1748 + }, + { + "epoch": 0.6229741763134461, + "grad_norm": 0.6802321672439575, + "learning_rate": 1e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.757027730345726, + "num_tokens": 502971817.0, + "step": 1749 + }, + { + "epoch": 0.6233303650934996, + "grad_norm": 0.6615080833435059, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7718997448682785, + "num_tokens": 503275974.0, + "step": 1750 + }, + { + "epoch": 0.623686553873553, + "grad_norm": 0.6740484833717346, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7677161991596222, + "num_tokens": 503559524.0, + "step": 1751 + }, + { + "epoch": 0.6240427426536064, + "grad_norm": 0.6790525913238525, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7638582587242126, + "num_tokens": 503869962.0, + "step": 1752 + }, + { + "epoch": 
0.6243989314336599, + "grad_norm": 0.7562258243560791, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7490924149751663, + "num_tokens": 504094374.0, + "step": 1753 + }, + { + "epoch": 0.6247551202137133, + "grad_norm": 0.7082468867301941, + "learning_rate": 1e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7451618313789368, + "num_tokens": 504383887.0, + "step": 1754 + }, + { + "epoch": 0.6251113089937667, + "grad_norm": 0.6710649132728577, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7576379925012589, + "num_tokens": 504673035.0, + "step": 1755 + }, + { + "epoch": 0.6254674977738202, + "grad_norm": 0.6733608245849609, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.760494589805603, + "num_tokens": 504967068.0, + "step": 1756 + }, + { + "epoch": 0.6258236865538735, + "grad_norm": 0.6935293078422546, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7517351359128952, + "num_tokens": 505235051.0, + "step": 1757 + }, + { + "epoch": 0.6261798753339269, + "grad_norm": 0.6679185628890991, + "learning_rate": 1e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7528282105922699, + "num_tokens": 505540892.0, + "step": 1758 + }, + { + "epoch": 0.6265360641139804, + "grad_norm": 0.7001211047172546, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7577172070741653, + "num_tokens": 505818786.0, + "step": 1759 + }, + { + "epoch": 0.6268922528940338, + "grad_norm": 0.6913585066795349, + "learning_rate": 1e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7532365620136261, + "num_tokens": 506111754.0, + "step": 1760 + }, + { + "epoch": 0.6272484416740872, + "grad_norm": 0.6701805591583252, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7498683780431747, + "num_tokens": 506440734.0, + "step": 1761 + }, + { + "epoch": 0.6276046304541407, + "grad_norm": 0.7326372265815735, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 
0.7362087965011597, + "num_tokens": 506731386.0, + "step": 1762 + }, + { + "epoch": 0.6279608192341941, + "grad_norm": 0.6459636092185974, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7706732451915741, + "num_tokens": 507042653.0, + "step": 1763 + }, + { + "epoch": 0.6283170080142475, + "grad_norm": 0.6838931441307068, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7391607761383057, + "num_tokens": 507344703.0, + "step": 1764 + }, + { + "epoch": 0.628673196794301, + "grad_norm": 0.7099730968475342, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7399801164865494, + "num_tokens": 507632384.0, + "step": 1765 + }, + { + "epoch": 0.6290293855743544, + "grad_norm": 0.6250625848770142, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7766125798225403, + "num_tokens": 507949843.0, + "step": 1766 + }, + { + "epoch": 0.6293855743544078, + "grad_norm": 0.7023468613624573, + "learning_rate": 1e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7716576904058456, + "num_tokens": 508214725.0, + "step": 1767 + }, + { + "epoch": 0.6297417631344613, + "grad_norm": 0.7093809843063354, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7698123604059219, + "num_tokens": 508486394.0, + "step": 1768 + }, + { + "epoch": 0.6300979519145147, + "grad_norm": 0.6932886838912964, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7496252059936523, + "num_tokens": 508766777.0, + "step": 1769 + }, + { + "epoch": 0.6304541406945681, + "grad_norm": 0.7054506540298462, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7710015177726746, + "num_tokens": 509052727.0, + "step": 1770 + }, + { + "epoch": 0.6308103294746216, + "grad_norm": 0.6344043016433716, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7730852216482162, + "num_tokens": 509359162.0, + "step": 1771 + }, + { + "epoch": 0.631166518254675, + "grad_norm": 
0.6711231470108032, + "learning_rate": 1e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7762398421764374, + "num_tokens": 509671806.0, + "step": 1772 + }, + { + "epoch": 0.6315227070347285, + "grad_norm": 0.7378241419792175, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7627194821834564, + "num_tokens": 509947303.0, + "step": 1773 + }, + { + "epoch": 0.6318788958147818, + "grad_norm": 0.717319667339325, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7495546936988831, + "num_tokens": 510252557.0, + "step": 1774 + }, + { + "epoch": 0.6322350845948352, + "grad_norm": 0.6644170880317688, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7666420191526413, + "num_tokens": 510544599.0, + "step": 1775 + }, + { + "epoch": 0.6325912733748887, + "grad_norm": 0.746849775314331, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7452043443918228, + "num_tokens": 510805659.0, + "step": 1776 + }, + { + "epoch": 0.6329474621549421, + "grad_norm": 0.7298145890235901, + "learning_rate": 1e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7733044475317001, + "num_tokens": 511105254.0, + "step": 1777 + }, + { + "epoch": 0.6333036509349955, + "grad_norm": 0.719068706035614, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7674673497676849, + "num_tokens": 511378583.0, + "step": 1778 + }, + { + "epoch": 0.633659839715049, + "grad_norm": 0.6914927363395691, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.761151060461998, + "num_tokens": 511681906.0, + "step": 1779 + }, + { + "epoch": 0.6340160284951024, + "grad_norm": 0.7088025212287903, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7446519881486893, + "num_tokens": 511959951.0, + "step": 1780 + }, + { + "epoch": 0.6343722172751558, + "grad_norm": 0.7428579926490784, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.759508028626442, + "num_tokens": 
512227303.0, + "step": 1781 + }, + { + "epoch": 0.6347284060552093, + "grad_norm": 0.7128955721855164, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7579114884138107, + "num_tokens": 512492799.0, + "step": 1782 + }, + { + "epoch": 0.6350845948352627, + "grad_norm": 0.6659238338470459, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7580128014087677, + "num_tokens": 512810590.0, + "step": 1783 + }, + { + "epoch": 0.6354407836153161, + "grad_norm": 0.7361065149307251, + "learning_rate": 1e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.754693478345871, + "num_tokens": 513082457.0, + "step": 1784 + }, + { + "epoch": 0.6357969723953696, + "grad_norm": 0.698357343673706, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7628153562545776, + "num_tokens": 513376979.0, + "step": 1785 + }, + { + "epoch": 0.636153161175423, + "grad_norm": 0.6891907453536987, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7611078023910522, + "num_tokens": 513651372.0, + "step": 1786 + }, + { + "epoch": 0.6365093499554764, + "grad_norm": 0.7332305312156677, + "learning_rate": 1e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.769699215888977, + "num_tokens": 513889601.0, + "step": 1787 + }, + { + "epoch": 0.6368655387355299, + "grad_norm": 0.693753719329834, + "learning_rate": 1e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.7678313553333282, + "num_tokens": 514181447.0, + "step": 1788 + }, + { + "epoch": 0.6372217275155833, + "grad_norm": 0.6862697005271912, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.741718128323555, + "num_tokens": 514467005.0, + "step": 1789 + }, + { + "epoch": 0.6375779162956366, + "grad_norm": 0.7183038592338562, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7494857311248779, + "num_tokens": 514744839.0, + "step": 1790 + }, + { + "epoch": 0.6379341050756901, + "grad_norm": 0.6768918633460999, + "learning_rate": 1e-06, + 
"loss": 0.8323, + "mean_token_accuracy": 0.7411160618066788, + "num_tokens": 515049692.0, + "step": 1791 + }, + { + "epoch": 0.6382902938557435, + "grad_norm": 0.7644667029380798, + "learning_rate": 1e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7546076625585556, + "num_tokens": 515296264.0, + "step": 1792 + }, + { + "epoch": 0.6386464826357969, + "grad_norm": 0.6878671646118164, + "learning_rate": 1e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7522113919258118, + "num_tokens": 515563775.0, + "step": 1793 + }, + { + "epoch": 0.6390026714158504, + "grad_norm": 0.7132260799407959, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7637520879507065, + "num_tokens": 515842911.0, + "step": 1794 + }, + { + "epoch": 0.6393588601959038, + "grad_norm": 0.6555992364883423, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7645918279886246, + "num_tokens": 516142133.0, + "step": 1795 + }, + { + "epoch": 0.6397150489759572, + "grad_norm": 0.7180405855178833, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7519588768482208, + "num_tokens": 516433376.0, + "step": 1796 + }, + { + "epoch": 0.6400712377560107, + "grad_norm": 0.7665255069732666, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7664418667554855, + "num_tokens": 516687294.0, + "step": 1797 + }, + { + "epoch": 0.6404274265360641, + "grad_norm": 0.6970040202140808, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7556150853633881, + "num_tokens": 516976938.0, + "step": 1798 + }, + { + "epoch": 0.6407836153161175, + "grad_norm": 0.6960156559944153, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7500888109207153, + "num_tokens": 517281359.0, + "step": 1799 + }, + { + "epoch": 0.641139804096171, + "grad_norm": 0.7064265012741089, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.764258936047554, + "num_tokens": 517550476.0, + "step": 1800 + }, + { + "epoch": 
0.6414959928762244, + "grad_norm": 0.6588308811187744, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7636685967445374, + "num_tokens": 517851727.0, + "step": 1801 + }, + { + "epoch": 0.6418521816562778, + "grad_norm": 0.7146672010421753, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7549308389425278, + "num_tokens": 518132056.0, + "step": 1802 + }, + { + "epoch": 0.6422083704363313, + "grad_norm": 0.7052079439163208, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7572005689144135, + "num_tokens": 518443227.0, + "step": 1803 + }, + { + "epoch": 0.6425645592163847, + "grad_norm": 0.67722088098526, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7637623995542526, + "num_tokens": 518743348.0, + "step": 1804 + }, + { + "epoch": 0.6429207479964381, + "grad_norm": 0.6616809964179993, + "learning_rate": 1e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7596268206834793, + "num_tokens": 519038573.0, + "step": 1805 + }, + { + "epoch": 0.6432769367764916, + "grad_norm": 0.619471549987793, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7627650201320648, + "num_tokens": 519367443.0, + "step": 1806 + }, + { + "epoch": 0.643633125556545, + "grad_norm": 0.7236934304237366, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.756719559431076, + "num_tokens": 519652696.0, + "step": 1807 + }, + { + "epoch": 0.6439893143365985, + "grad_norm": 0.7042493224143982, + "learning_rate": 1e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7780864089727402, + "num_tokens": 519926949.0, + "step": 1808 + }, + { + "epoch": 0.6443455031166518, + "grad_norm": 0.6667414903640747, + "learning_rate": 1e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7644105404615402, + "num_tokens": 520221452.0, + "step": 1809 + }, + { + "epoch": 0.6447016918967052, + "grad_norm": 0.7006555795669556, + "learning_rate": 1e-06, + "loss": 0.6821, + "mean_token_accuracy": 
0.7738779187202454, + "num_tokens": 520478727.0, + "step": 1810 + }, + { + "epoch": 0.6450578806767587, + "grad_norm": 0.7383959889411926, + "learning_rate": 1e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.749495193362236, + "num_tokens": 520739114.0, + "step": 1811 + }, + { + "epoch": 0.6454140694568121, + "grad_norm": 0.6933789253234863, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7431515455245972, + "num_tokens": 521034942.0, + "step": 1812 + }, + { + "epoch": 0.6457702582368655, + "grad_norm": 2.814211845397949, + "learning_rate": 1e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7769379615783691, + "num_tokens": 521354216.0, + "step": 1813 + }, + { + "epoch": 0.646126447016919, + "grad_norm": 0.6928390860557556, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7679993957281113, + "num_tokens": 521638249.0, + "step": 1814 + }, + { + "epoch": 0.6464826357969724, + "grad_norm": 0.7176980376243591, + "learning_rate": 1e-06, + "loss": 0.74, + "mean_token_accuracy": 0.767915815114975, + "num_tokens": 521905379.0, + "step": 1815 + }, + { + "epoch": 0.6468388245770258, + "grad_norm": 0.6669657230377197, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7296735793352127, + "num_tokens": 522210304.0, + "step": 1816 + }, + { + "epoch": 0.6471950133570793, + "grad_norm": 0.7221991419792175, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7669887393712997, + "num_tokens": 522476954.0, + "step": 1817 + }, + { + "epoch": 0.6475512021371327, + "grad_norm": 0.7518717646598816, + "learning_rate": 1e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7663061320781708, + "num_tokens": 522761923.0, + "step": 1818 + }, + { + "epoch": 0.6479073909171861, + "grad_norm": 0.6709536910057068, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7709971815347672, + "num_tokens": 523043097.0, + "step": 1819 + }, + { + "epoch": 0.6482635796972396, + "grad_norm": 
0.6700830459594727, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7604598850011826, + "num_tokens": 523349616.0, + "step": 1820 + }, + { + "epoch": 0.648619768477293, + "grad_norm": 0.7356614470481873, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7472017705440521, + "num_tokens": 523588930.0, + "step": 1821 + }, + { + "epoch": 0.6489759572573464, + "grad_norm": 0.6425631642341614, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7579367309808731, + "num_tokens": 523893386.0, + "step": 1822 + }, + { + "epoch": 0.6493321460373999, + "grad_norm": 0.686040461063385, + "learning_rate": 1e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7694131135940552, + "num_tokens": 524152060.0, + "step": 1823 + }, + { + "epoch": 0.6496883348174533, + "grad_norm": 0.7122654318809509, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7599347829818726, + "num_tokens": 524442782.0, + "step": 1824 + }, + { + "epoch": 0.6500445235975066, + "grad_norm": 0.6876553297042847, + "learning_rate": 1e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7702081799507141, + "num_tokens": 524729658.0, + "step": 1825 + }, + { + "epoch": 0.6504007123775601, + "grad_norm": 0.6752170324325562, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7461092621088028, + "num_tokens": 525027408.0, + "step": 1826 + }, + { + "epoch": 0.6507569011576135, + "grad_norm": 0.7210555076599121, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7677012979984283, + "num_tokens": 525289086.0, + "step": 1827 + }, + { + "epoch": 0.6511130899376669, + "grad_norm": 0.6928371787071228, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7557150274515152, + "num_tokens": 525566811.0, + "step": 1828 + }, + { + "epoch": 0.6514692787177204, + "grad_norm": 0.7258584499359131, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7673310786485672, + "num_tokens": 
525825999.0, + "step": 1829 + }, + { + "epoch": 0.6518254674977738, + "grad_norm": 0.6849112510681152, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.742307648062706, + "num_tokens": 526102586.0, + "step": 1830 + }, + { + "epoch": 0.6521816562778272, + "grad_norm": 0.7156695127487183, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.744685560464859, + "num_tokens": 526382663.0, + "step": 1831 + }, + { + "epoch": 0.6525378450578807, + "grad_norm": 0.6829181909561157, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7716548144817352, + "num_tokens": 526697590.0, + "step": 1832 + }, + { + "epoch": 0.6528940338379341, + "grad_norm": 0.7194775342941284, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7521026879549026, + "num_tokens": 526952242.0, + "step": 1833 + }, + { + "epoch": 0.6532502226179875, + "grad_norm": 0.6826184988021851, + "learning_rate": 1e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7677672058343887, + "num_tokens": 527241821.0, + "step": 1834 + }, + { + "epoch": 0.653606411398041, + "grad_norm": 0.6802335977554321, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7736080586910248, + "num_tokens": 527532225.0, + "step": 1835 + }, + { + "epoch": 0.6539626001780944, + "grad_norm": 0.7198996543884277, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7443413585424423, + "num_tokens": 527788643.0, + "step": 1836 + }, + { + "epoch": 0.6543187889581478, + "grad_norm": 0.6870719790458679, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7446089833974838, + "num_tokens": 528088117.0, + "step": 1837 + }, + { + "epoch": 0.6546749777382013, + "grad_norm": 0.6917216777801514, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7466326653957367, + "num_tokens": 528381112.0, + "step": 1838 + }, + { + "epoch": 0.6550311665182547, + "grad_norm": 0.7102321982383728, + "learning_rate": 1e-06, + 
"loss": 0.8643, + "mean_token_accuracy": 0.7354242503643036, + "num_tokens": 528634483.0, + "step": 1839 + }, + { + "epoch": 0.655387355298308, + "grad_norm": 0.7612544894218445, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.764597550034523, + "num_tokens": 528886379.0, + "step": 1840 + }, + { + "epoch": 0.6557435440783616, + "grad_norm": 0.6804913282394409, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7659043818712234, + "num_tokens": 529161452.0, + "step": 1841 + }, + { + "epoch": 0.6560997328584149, + "grad_norm": 0.6858248114585876, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7652646154165268, + "num_tokens": 529430616.0, + "step": 1842 + }, + { + "epoch": 0.6564559216384684, + "grad_norm": 0.6901335716247559, + "learning_rate": 1e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7468603551387787, + "num_tokens": 529712743.0, + "step": 1843 + }, + { + "epoch": 0.6568121104185218, + "grad_norm": 0.7565010786056519, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7622515112161636, + "num_tokens": 529973762.0, + "step": 1844 + }, + { + "epoch": 0.6571682991985752, + "grad_norm": 0.7796660661697388, + "learning_rate": 1e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.760289803147316, + "num_tokens": 530201846.0, + "step": 1845 + }, + { + "epoch": 0.6575244879786287, + "grad_norm": 0.6657456159591675, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7765278071165085, + "num_tokens": 530503930.0, + "step": 1846 + }, + { + "epoch": 0.6578806767586821, + "grad_norm": 0.7188791036605835, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7505992203950882, + "num_tokens": 530776273.0, + "step": 1847 + }, + { + "epoch": 0.6582368655387355, + "grad_norm": 0.7154449820518494, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7606364190578461, + "num_tokens": 531012893.0, + "step": 1848 + }, + { + "epoch": 
0.658593054318789, + "grad_norm": 0.6860668659210205, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7478230893611908, + "num_tokens": 531301614.0, + "step": 1849 + }, + { + "epoch": 0.6589492430988424, + "grad_norm": 0.6702057719230652, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7741749286651611, + "num_tokens": 531609677.0, + "step": 1850 + }, + { + "epoch": 0.6593054318788958, + "grad_norm": 0.6607875823974609, + "learning_rate": 1e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7854603379964828, + "num_tokens": 531926038.0, + "step": 1851 + }, + { + "epoch": 0.6596616206589493, + "grad_norm": 0.6960235238075256, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7473741769790649, + "num_tokens": 532183525.0, + "step": 1852 + }, + { + "epoch": 0.6600178094390027, + "grad_norm": 0.6983980536460876, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7580018043518066, + "num_tokens": 532482209.0, + "step": 1853 + }, + { + "epoch": 0.6603739982190561, + "grad_norm": 0.6285166144371033, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7504656463861465, + "num_tokens": 532831040.0, + "step": 1854 + }, + { + "epoch": 0.6607301869991096, + "grad_norm": 0.6962078809738159, + "learning_rate": 1e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7583657652139664, + "num_tokens": 533142950.0, + "step": 1855 + }, + { + "epoch": 0.661086375779163, + "grad_norm": 0.694612979888916, + "learning_rate": 1e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7717425376176834, + "num_tokens": 533421949.0, + "step": 1856 + }, + { + "epoch": 0.6614425645592164, + "grad_norm": 0.7097789645195007, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7649855464696884, + "num_tokens": 533689680.0, + "step": 1857 + }, + { + "epoch": 0.6617987533392699, + "grad_norm": 0.7100849151611328, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 
0.7573982924222946, + "num_tokens": 533952679.0, + "step": 1858 + }, + { + "epoch": 0.6621549421193232, + "grad_norm": 0.6307033896446228, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.762213870882988, + "num_tokens": 534269747.0, + "step": 1859 + }, + { + "epoch": 0.6625111308993766, + "grad_norm": 0.7119840383529663, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7466669082641602, + "num_tokens": 534540837.0, + "step": 1860 + }, + { + "epoch": 0.6628673196794301, + "grad_norm": 0.6920320987701416, + "learning_rate": 1e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.748995915055275, + "num_tokens": 534823093.0, + "step": 1861 + }, + { + "epoch": 0.6632235084594835, + "grad_norm": 0.6584218144416809, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7507133185863495, + "num_tokens": 535121174.0, + "step": 1862 + }, + { + "epoch": 0.6635796972395369, + "grad_norm": 0.6781086325645447, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7503534257411957, + "num_tokens": 535415768.0, + "step": 1863 + }, + { + "epoch": 0.6639358860195904, + "grad_norm": 0.7021344304084778, + "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7552514523267746, + "num_tokens": 535719527.0, + "step": 1864 + }, + { + "epoch": 0.6642920747996438, + "grad_norm": 0.641114354133606, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7457084953784943, + "num_tokens": 536067649.0, + "step": 1865 + }, + { + "epoch": 0.6646482635796972, + "grad_norm": 0.7102373242378235, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7346048057079315, + "num_tokens": 536347157.0, + "step": 1866 + }, + { + "epoch": 0.6650044523597507, + "grad_norm": 0.6817806959152222, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7554149180650711, + "num_tokens": 536629390.0, + "step": 1867 + }, + { + "epoch": 0.6653606411398041, + "grad_norm": 
0.6936845183372498, + "learning_rate": 1e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7490464448928833, + "num_tokens": 536935529.0, + "step": 1868 + }, + { + "epoch": 0.6657168299198575, + "grad_norm": 0.7320157289505005, + "learning_rate": 1e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7446599900722504, + "num_tokens": 537209757.0, + "step": 1869 + }, + { + "epoch": 0.666073018699911, + "grad_norm": 0.7067034244537354, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7495013177394867, + "num_tokens": 537497660.0, + "step": 1870 + }, + { + "epoch": 0.6664292074799644, + "grad_norm": 0.7472741007804871, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7530842125415802, + "num_tokens": 537764914.0, + "step": 1871 + }, + { + "epoch": 0.6667853962600178, + "grad_norm": 0.6711294651031494, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7582037150859833, + "num_tokens": 538086342.0, + "step": 1872 + }, + { + "epoch": 0.6671415850400713, + "grad_norm": 0.7127112150192261, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7528895884752274, + "num_tokens": 538396831.0, + "step": 1873 + }, + { + "epoch": 0.6674977738201247, + "grad_norm": 0.7335346341133118, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7445068061351776, + "num_tokens": 538664321.0, + "step": 1874 + }, + { + "epoch": 0.667853962600178, + "grad_norm": 0.7325490117073059, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7590707838535309, + "num_tokens": 538937581.0, + "step": 1875 + }, + { + "epoch": 0.6682101513802315, + "grad_norm": 0.719024121761322, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7506338804960251, + "num_tokens": 539232067.0, + "step": 1876 + }, + { + "epoch": 0.6685663401602849, + "grad_norm": 0.7320206165313721, + "learning_rate": 1e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7709451466798782, + "num_tokens": 
539514032.0, + "step": 1877 + }, + { + "epoch": 0.6689225289403384, + "grad_norm": 0.6713582277297974, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.749646320939064, + "num_tokens": 539801163.0, + "step": 1878 + }, + { + "epoch": 0.6692787177203918, + "grad_norm": 0.7537110447883606, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7665046155452728, + "num_tokens": 540070430.0, + "step": 1879 + }, + { + "epoch": 0.6696349065004452, + "grad_norm": 0.6548277735710144, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7624891400337219, + "num_tokens": 540419682.0, + "step": 1880 + }, + { + "epoch": 0.6699910952804987, + "grad_norm": 0.6466699838638306, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7648080438375473, + "num_tokens": 540731724.0, + "step": 1881 + }, + { + "epoch": 0.6703472840605521, + "grad_norm": 0.6823621988296509, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7539390027523041, + "num_tokens": 541026656.0, + "step": 1882 + }, + { + "epoch": 0.6707034728406055, + "grad_norm": 0.7732957601547241, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7689783722162247, + "num_tokens": 541277629.0, + "step": 1883 + }, + { + "epoch": 0.671059661620659, + "grad_norm": 0.6696116924285889, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7636442929506302, + "num_tokens": 541585327.0, + "step": 1884 + }, + { + "epoch": 0.6714158504007124, + "grad_norm": 0.7106956243515015, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7603705376386642, + "num_tokens": 541853319.0, + "step": 1885 + }, + { + "epoch": 0.6717720391807658, + "grad_norm": 0.7180782556533813, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7453117519617081, + "num_tokens": 542116251.0, + "step": 1886 + }, + { + "epoch": 0.6721282279608193, + "grad_norm": 0.7007905840873718, + "learning_rate": 1e-06, 
+ "loss": 0.8349, + "mean_token_accuracy": 0.7464391440153122, + "num_tokens": 542406421.0, + "step": 1887 + }, + { + "epoch": 0.6724844167408727, + "grad_norm": 0.70463627576828, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7436519414186478, + "num_tokens": 542672837.0, + "step": 1888 + }, + { + "epoch": 0.6728406055209261, + "grad_norm": 0.7237235903739929, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7576453536748886, + "num_tokens": 542930066.0, + "step": 1889 + }, + { + "epoch": 0.6731967943009796, + "grad_norm": 0.6484046578407288, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7738863676786423, + "num_tokens": 543232939.0, + "step": 1890 + }, + { + "epoch": 0.673552983081033, + "grad_norm": 0.7545703649520874, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7550286799669266, + "num_tokens": 543480474.0, + "step": 1891 + }, + { + "epoch": 0.6739091718610863, + "grad_norm": 0.6712193489074707, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7559445947408676, + "num_tokens": 543750813.0, + "step": 1892 + }, + { + "epoch": 0.6742653606411398, + "grad_norm": 0.6778460741043091, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7424838989973068, + "num_tokens": 544045114.0, + "step": 1893 + }, + { + "epoch": 0.6746215494211932, + "grad_norm": 0.6538885831832886, + "learning_rate": 1e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7773860841989517, + "num_tokens": 544364999.0, + "step": 1894 + }, + { + "epoch": 0.6749777382012466, + "grad_norm": 0.6929061412811279, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7536479979753494, + "num_tokens": 544641268.0, + "step": 1895 + }, + { + "epoch": 0.6753339269813001, + "grad_norm": 0.6811999082565308, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7586745917797089, + "num_tokens": 544926859.0, + "step": 1896 + }, + { + "epoch": 
0.6756901157613535, + "grad_norm": 0.7391690611839294, + "learning_rate": 1e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7497228384017944, + "num_tokens": 545171631.0, + "step": 1897 + }, + { + "epoch": 0.6760463045414069, + "grad_norm": 0.6459920406341553, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7514657229185104, + "num_tokens": 545484852.0, + "step": 1898 + }, + { + "epoch": 0.6764024933214604, + "grad_norm": 0.6551792025566101, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7573139071464539, + "num_tokens": 545785087.0, + "step": 1899 + }, + { + "epoch": 0.6767586821015138, + "grad_norm": 0.691848874092102, + "learning_rate": 1e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.765732541680336, + "num_tokens": 546069341.0, + "step": 1900 + }, + { + "epoch": 0.6771148708815672, + "grad_norm": 0.6925395131111145, + "learning_rate": 1e-06, + "loss": 0.707, + "mean_token_accuracy": 0.773474931716919, + "num_tokens": 546337316.0, + "step": 1901 + }, + { + "epoch": 0.6774710596616207, + "grad_norm": 0.6820418238639832, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7759462893009186, + "num_tokens": 546603244.0, + "step": 1902 + }, + { + "epoch": 0.6778272484416741, + "grad_norm": 0.6720638871192932, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.761791467666626, + "num_tokens": 546895041.0, + "step": 1903 + }, + { + "epoch": 0.6781834372217275, + "grad_norm": 0.6777287125587463, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7577344477176666, + "num_tokens": 547173589.0, + "step": 1904 + }, + { + "epoch": 0.678539626001781, + "grad_norm": 0.6480311155319214, + "learning_rate": 1e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.7789741605520248, + "num_tokens": 547486756.0, + "step": 1905 + }, + { + "epoch": 0.6788958147818344, + "grad_norm": 0.6634626388549805, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 
0.7531055361032486, + "num_tokens": 547781210.0, + "step": 1906 + }, + { + "epoch": 0.6792520035618878, + "grad_norm": 0.6678534746170044, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.758016899228096, + "num_tokens": 548081274.0, + "step": 1907 + }, + { + "epoch": 0.6796081923419413, + "grad_norm": 0.7547051906585693, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7472656518220901, + "num_tokens": 548345793.0, + "step": 1908 + }, + { + "epoch": 0.6799643811219946, + "grad_norm": 0.7128207087516785, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7601193785667419, + "num_tokens": 548609295.0, + "step": 1909 + }, + { + "epoch": 0.680320569902048, + "grad_norm": 0.6714663505554199, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7442234754562378, + "num_tokens": 548893564.0, + "step": 1910 + }, + { + "epoch": 0.6806767586821015, + "grad_norm": 0.7424867153167725, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7470041662454605, + "num_tokens": 549191116.0, + "step": 1911 + }, + { + "epoch": 0.6810329474621549, + "grad_norm": 0.7469907999038696, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7491241246461868, + "num_tokens": 549466927.0, + "step": 1912 + }, + { + "epoch": 0.6813891362422083, + "grad_norm": 0.7332457900047302, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7448129206895828, + "num_tokens": 549745683.0, + "step": 1913 + }, + { + "epoch": 0.6817453250222618, + "grad_norm": 0.6282956004142761, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7696951627731323, + "num_tokens": 550056880.0, + "step": 1914 + }, + { + "epoch": 0.6821015138023152, + "grad_norm": 0.6646697521209717, + "learning_rate": 1e-06, + "loss": 0.6829, + "mean_token_accuracy": 0.7803360223770142, + "num_tokens": 550336353.0, + "step": 1915 + }, + { + "epoch": 0.6824577025823687, + "grad_norm": 
0.6717336177825928, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7303353250026703, + "num_tokens": 550619706.0, + "step": 1916 + }, + { + "epoch": 0.6828138913624221, + "grad_norm": 0.7147629261016846, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7447956651449203, + "num_tokens": 550877863.0, + "step": 1917 + }, + { + "epoch": 0.6831700801424755, + "grad_norm": 0.6381688714027405, + "learning_rate": 1e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7451864182949066, + "num_tokens": 551181004.0, + "step": 1918 + }, + { + "epoch": 0.683526268922529, + "grad_norm": 0.6960310935974121, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7612711936235428, + "num_tokens": 551447292.0, + "step": 1919 + }, + { + "epoch": 0.6838824577025824, + "grad_norm": 0.7407668232917786, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7328133136034012, + "num_tokens": 551711233.0, + "step": 1920 + }, + { + "epoch": 0.6842386464826358, + "grad_norm": 0.7165109515190125, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.747399166226387, + "num_tokens": 551975491.0, + "step": 1921 + }, + { + "epoch": 0.6845948352626893, + "grad_norm": 0.6552631855010986, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7736817449331284, + "num_tokens": 552299707.0, + "step": 1922 + }, + { + "epoch": 0.6849510240427427, + "grad_norm": 0.7424617409706116, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7445367127656937, + "num_tokens": 552563418.0, + "step": 1923 + }, + { + "epoch": 0.685307212822796, + "grad_norm": 0.6901903748512268, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7630424201488495, + "num_tokens": 552841297.0, + "step": 1924 + }, + { + "epoch": 0.6856634016028496, + "grad_norm": 0.672964870929718, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7622916102409363, + "num_tokens": 
553138108.0, + "step": 1925 + }, + { + "epoch": 0.6860195903829029, + "grad_norm": 0.7270849943161011, + "learning_rate": 1e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7666619122028351, + "num_tokens": 553400860.0, + "step": 1926 + }, + { + "epoch": 0.6863757791629563, + "grad_norm": 0.6993374228477478, + "learning_rate": 1e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7746281176805496, + "num_tokens": 553654813.0, + "step": 1927 + }, + { + "epoch": 0.6867319679430098, + "grad_norm": 0.7005759477615356, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7527921050786972, + "num_tokens": 553920069.0, + "step": 1928 + }, + { + "epoch": 0.6870881567230632, + "grad_norm": 0.6813905835151672, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7606709152460098, + "num_tokens": 554189187.0, + "step": 1929 + }, + { + "epoch": 0.6874443455031166, + "grad_norm": 0.6637573838233948, + "learning_rate": 1e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7612065821886063, + "num_tokens": 554499309.0, + "step": 1930 + }, + { + "epoch": 0.6878005342831701, + "grad_norm": 0.6780270934104919, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.764078363776207, + "num_tokens": 554803883.0, + "step": 1931 + }, + { + "epoch": 0.6881567230632235, + "grad_norm": 0.7096888422966003, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7509056925773621, + "num_tokens": 555079136.0, + "step": 1932 + }, + { + "epoch": 0.6885129118432769, + "grad_norm": 0.6793949604034424, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7718251198530197, + "num_tokens": 555363465.0, + "step": 1933 + }, + { + "epoch": 0.6888691006233304, + "grad_norm": 0.7233136296272278, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7610096484422684, + "num_tokens": 555633591.0, + "step": 1934 + }, + { + "epoch": 0.6892252894033838, + "grad_norm": 0.7075399160385132, + "learning_rate": 1e-06, 
+ "loss": 0.7489, + "mean_token_accuracy": 0.7592632323503494, + "num_tokens": 555910225.0, + "step": 1935 + }, + { + "epoch": 0.6895814781834372, + "grad_norm": 0.6793146133422852, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7591686695814133, + "num_tokens": 556187782.0, + "step": 1936 + }, + { + "epoch": 0.6899376669634907, + "grad_norm": 0.723378598690033, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7545914053916931, + "num_tokens": 556441245.0, + "step": 1937 + }, + { + "epoch": 0.6902938557435441, + "grad_norm": 0.7049710750579834, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7583789229393005, + "num_tokens": 556708557.0, + "step": 1938 + }, + { + "epoch": 0.6906500445235975, + "grad_norm": 0.717097818851471, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7609783411026001, + "num_tokens": 556976192.0, + "step": 1939 + }, + { + "epoch": 0.691006233303651, + "grad_norm": 0.7188740372657776, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7441268116235733, + "num_tokens": 557249180.0, + "step": 1940 + }, + { + "epoch": 0.6913624220837044, + "grad_norm": 0.6601342558860779, + "learning_rate": 1e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7713555842638016, + "num_tokens": 557535935.0, + "step": 1941 + }, + { + "epoch": 0.6917186108637577, + "grad_norm": 0.7071787714958191, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7496667206287384, + "num_tokens": 557788274.0, + "step": 1942 + }, + { + "epoch": 0.6920747996438112, + "grad_norm": 0.6646275520324707, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7686352729797363, + "num_tokens": 558063894.0, + "step": 1943 + }, + { + "epoch": 0.6924309884238646, + "grad_norm": 0.706809401512146, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7614964544773102, + "num_tokens": 558319914.0, + "step": 1944 + }, + { + "epoch": 
0.692787177203918, + "grad_norm": 0.6517859101295471, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7535139620304108, + "num_tokens": 558618862.0, + "step": 1945 + }, + { + "epoch": 0.6931433659839715, + "grad_norm": 0.6810131669044495, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.748174861073494, + "num_tokens": 558897479.0, + "step": 1946 + }, + { + "epoch": 0.6934995547640249, + "grad_norm": 0.6415425539016724, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7652009278535843, + "num_tokens": 559217297.0, + "step": 1947 + }, + { + "epoch": 0.6938557435440783, + "grad_norm": 0.6682295799255371, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7622586637735367, + "num_tokens": 559518058.0, + "step": 1948 + }, + { + "epoch": 0.6942119323241318, + "grad_norm": 0.6816873550415039, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.752083495259285, + "num_tokens": 559798949.0, + "step": 1949 + }, + { + "epoch": 0.6945681211041852, + "grad_norm": 0.7313178777694702, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.740634024143219, + "num_tokens": 560058888.0, + "step": 1950 + }, + { + "epoch": 0.6949243098842387, + "grad_norm": 0.6806917786598206, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7438462972640991, + "num_tokens": 560350749.0, + "step": 1951 + }, + { + "epoch": 0.6952804986642921, + "grad_norm": 0.6825738549232483, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7543278634548187, + "num_tokens": 560651798.0, + "step": 1952 + }, + { + "epoch": 0.6956366874443455, + "grad_norm": 0.6662881374359131, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7664649188518524, + "num_tokens": 560950249.0, + "step": 1953 + }, + { + "epoch": 0.695992876224399, + "grad_norm": 0.6783444285392761, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 
0.7560007721185684, + "num_tokens": 561255063.0, + "step": 1954 + }, + { + "epoch": 0.6963490650044524, + "grad_norm": 0.6882484555244446, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7459448128938675, + "num_tokens": 561530541.0, + "step": 1955 + }, + { + "epoch": 0.6967052537845058, + "grad_norm": 0.708329975605011, + "learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7559413611888885, + "num_tokens": 561797135.0, + "step": 1956 + }, + { + "epoch": 0.6970614425645593, + "grad_norm": 0.6562976837158203, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7644938677549362, + "num_tokens": 562110932.0, + "step": 1957 + }, + { + "epoch": 0.6974176313446127, + "grad_norm": 0.6563349366188049, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.764511376619339, + "num_tokens": 562401686.0, + "step": 1958 + }, + { + "epoch": 0.697773820124666, + "grad_norm": 0.6766883134841919, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7630653232336044, + "num_tokens": 562676772.0, + "step": 1959 + }, + { + "epoch": 0.6981300089047195, + "grad_norm": 0.6675964593887329, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7730969339609146, + "num_tokens": 562959441.0, + "step": 1960 + }, + { + "epoch": 0.6984861976847729, + "grad_norm": 0.6997178792953491, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7549775838851929, + "num_tokens": 563229316.0, + "step": 1961 + }, + { + "epoch": 0.6988423864648263, + "grad_norm": 0.6607606410980225, + "learning_rate": 1e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7882129400968552, + "num_tokens": 563523302.0, + "step": 1962 + }, + { + "epoch": 0.6991985752448798, + "grad_norm": 0.6843350529670715, + "learning_rate": 1e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7707704603672028, + "num_tokens": 563819557.0, + "step": 1963 + }, + { + "epoch": 0.6995547640249332, + "grad_norm": 
0.7295883297920227, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7623310983181, + "num_tokens": 564096692.0, + "step": 1964 + }, + { + "epoch": 0.6999109528049866, + "grad_norm": 0.6638275980949402, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7530904561281204, + "num_tokens": 564396408.0, + "step": 1965 + }, + { + "epoch": 0.7002671415850401, + "grad_norm": 0.6578758358955383, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7614713609218597, + "num_tokens": 564719868.0, + "step": 1966 + }, + { + "epoch": 0.7006233303650935, + "grad_norm": 0.6630499958992004, + "learning_rate": 1e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7726186066865921, + "num_tokens": 565001236.0, + "step": 1967 + }, + { + "epoch": 0.7009795191451469, + "grad_norm": 0.6875316500663757, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7622600197792053, + "num_tokens": 565278720.0, + "step": 1968 + }, + { + "epoch": 0.7013357079252004, + "grad_norm": 0.6710671186447144, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7769084572792053, + "num_tokens": 565550815.0, + "step": 1969 + }, + { + "epoch": 0.7016918967052538, + "grad_norm": 0.6666937470436096, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7634280323982239, + "num_tokens": 565844310.0, + "step": 1970 + }, + { + "epoch": 0.7020480854853072, + "grad_norm": 0.6599796414375305, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7640215456485748, + "num_tokens": 566156699.0, + "step": 1971 + }, + { + "epoch": 0.7024042742653607, + "grad_norm": 0.6749625205993652, + "learning_rate": 1e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7618428170681, + "num_tokens": 566443505.0, + "step": 1972 + }, + { + "epoch": 0.7027604630454141, + "grad_norm": 0.6515078544616699, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7579381465911865, + "num_tokens": 
566785376.0, + "step": 1973 + }, + { + "epoch": 0.7031166518254675, + "grad_norm": 0.7360816597938538, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7467682063579559, + "num_tokens": 567038745.0, + "step": 1974 + }, + { + "epoch": 0.703472840605521, + "grad_norm": 0.6303831338882446, + "learning_rate": 1e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.74729423224926, + "num_tokens": 567344073.0, + "step": 1975 + }, + { + "epoch": 0.7038290293855743, + "grad_norm": 0.6908464431762695, + "learning_rate": 1e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7642059624195099, + "num_tokens": 567641394.0, + "step": 1976 + }, + { + "epoch": 0.7041852181656277, + "grad_norm": 0.6895170211791992, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7640035897493362, + "num_tokens": 567928941.0, + "step": 1977 + }, + { + "epoch": 0.7045414069456812, + "grad_norm": 0.6534669995307922, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7663613855838776, + "num_tokens": 568226638.0, + "step": 1978 + }, + { + "epoch": 0.7048975957257346, + "grad_norm": 0.679785430431366, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7525327354669571, + "num_tokens": 568505619.0, + "step": 1979 + }, + { + "epoch": 0.705253784505788, + "grad_norm": 0.7018275856971741, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7593072354793549, + "num_tokens": 568789524.0, + "step": 1980 + }, + { + "epoch": 0.7056099732858415, + "grad_norm": 0.683283269405365, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7465498894453049, + "num_tokens": 569082816.0, + "step": 1981 + }, + { + "epoch": 0.7059661620658949, + "grad_norm": 0.7000215649604797, + "learning_rate": 1e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7597387433052063, + "num_tokens": 569356411.0, + "step": 1982 + }, + { + "epoch": 0.7063223508459483, + "grad_norm": 0.7248703241348267, + "learning_rate": 1e-06, + 
"loss": 0.8753, + "mean_token_accuracy": 0.7350958734750748, + "num_tokens": 569616162.0, + "step": 1983 + }, + { + "epoch": 0.7066785396260018, + "grad_norm": 0.6836004257202148, + "learning_rate": 1e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7586522549390793, + "num_tokens": 569893381.0, + "step": 1984 + }, + { + "epoch": 0.7070347284060552, + "grad_norm": 0.6815172433853149, + "learning_rate": 1e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7502942830324173, + "num_tokens": 570194866.0, + "step": 1985 + }, + { + "epoch": 0.7073909171861087, + "grad_norm": 0.6526634693145752, + "learning_rate": 1e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7750817686319351, + "num_tokens": 570493062.0, + "step": 1986 + }, + { + "epoch": 0.7077471059661621, + "grad_norm": 0.6880009174346924, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.753306120634079, + "num_tokens": 570773311.0, + "step": 1987 + }, + { + "epoch": 0.7081032947462155, + "grad_norm": 0.675605833530426, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7684335112571716, + "num_tokens": 571064829.0, + "step": 1988 + }, + { + "epoch": 0.708459483526269, + "grad_norm": 0.6879411339759827, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7620659023523331, + "num_tokens": 571350380.0, + "step": 1989 + }, + { + "epoch": 0.7088156723063224, + "grad_norm": 0.6611787676811218, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7536574602127075, + "num_tokens": 571656333.0, + "step": 1990 + }, + { + "epoch": 0.7091718610863758, + "grad_norm": 0.6827945113182068, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7308322787284851, + "num_tokens": 571948557.0, + "step": 1991 + }, + { + "epoch": 0.7095280498664293, + "grad_norm": 0.6202526092529297, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7728352248668671, + "num_tokens": 572270936.0, + "step": 1992 + }, + { + "epoch": 
0.7098842386464826, + "grad_norm": 0.6324679255485535, + "learning_rate": 1e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.7571640312671661, + "num_tokens": 572586657.0, + "step": 1993 + }, + { + "epoch": 0.710240427426536, + "grad_norm": 0.696519136428833, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7465923875570297, + "num_tokens": 572863593.0, + "step": 1994 + }, + { + "epoch": 0.7105966162065895, + "grad_norm": 0.6284644603729248, + "learning_rate": 1e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7734453678131104, + "num_tokens": 573162905.0, + "step": 1995 + }, + { + "epoch": 0.7109528049866429, + "grad_norm": 0.6326555609703064, + "learning_rate": 1e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7521487474441528, + "num_tokens": 573472673.0, + "step": 1996 + }, + { + "epoch": 0.7113089937666963, + "grad_norm": 0.6753260493278503, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7594591677188873, + "num_tokens": 573776129.0, + "step": 1997 + }, + { + "epoch": 0.7116651825467498, + "grad_norm": 0.7017154693603516, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7633507698774338, + "num_tokens": 574032537.0, + "step": 1998 + }, + { + "epoch": 0.7120213713268032, + "grad_norm": 0.6346924901008606, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7704823762178421, + "num_tokens": 574347003.0, + "step": 1999 + }, + { + "epoch": 0.7123775601068566, + "grad_norm": 0.6603803634643555, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7401134222745895, + "num_tokens": 574637275.0, + "step": 2000 + }, + { + "epoch": 0.7127337488869101, + "grad_norm": 0.640138566493988, + "learning_rate": 1e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7796397358179092, + "num_tokens": 574954194.0, + "step": 2001 + }, + { + "epoch": 0.7130899376669635, + "grad_norm": 0.6844160556793213, + "learning_rate": 1e-06, + "loss": 0.6774, + "mean_token_accuracy": 
0.784129410982132, + "num_tokens": 575252688.0, + "step": 2002 + }, + { + "epoch": 0.7134461264470169, + "grad_norm": 0.702631950378418, + "learning_rate": 1e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7680041491985321, + "num_tokens": 575537569.0, + "step": 2003 + }, + { + "epoch": 0.7138023152270704, + "grad_norm": 0.6695998311042786, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7640174329280853, + "num_tokens": 575819041.0, + "step": 2004 + }, + { + "epoch": 0.7141585040071238, + "grad_norm": 0.6402955651283264, + "learning_rate": 1e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.753448948264122, + "num_tokens": 576138294.0, + "step": 2005 + }, + { + "epoch": 0.7145146927871772, + "grad_norm": 0.6995715498924255, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7650301605463028, + "num_tokens": 576415677.0, + "step": 2006 + }, + { + "epoch": 0.7148708815672307, + "grad_norm": 0.6629147529602051, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7696394473314285, + "num_tokens": 576745064.0, + "step": 2007 + }, + { + "epoch": 0.715227070347284, + "grad_norm": 0.6856498718261719, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7560966610908508, + "num_tokens": 577026467.0, + "step": 2008 + }, + { + "epoch": 0.7155832591273374, + "grad_norm": 0.6555923819541931, + "learning_rate": 1e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7594135403633118, + "num_tokens": 577315491.0, + "step": 2009 + }, + { + "epoch": 0.715939447907391, + "grad_norm": 1.1008193492889404, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7682275027036667, + "num_tokens": 577582023.0, + "step": 2010 + }, + { + "epoch": 0.7162956366874443, + "grad_norm": 0.740885317325592, + "learning_rate": 1e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.7731856107711792, + "num_tokens": 577866596.0, + "step": 2011 + }, + { + "epoch": 0.7166518254674977, + "grad_norm": 
0.7031635046005249, + "learning_rate": 1e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7540151476860046, + "num_tokens": 578158937.0, + "step": 2012 + }, + { + "epoch": 0.7170080142475512, + "grad_norm": 0.72397381067276, + "learning_rate": 1e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.763093888759613, + "num_tokens": 578418148.0, + "step": 2013 + }, + { + "epoch": 0.7173642030276046, + "grad_norm": 0.6891671419143677, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7568914592266083, + "num_tokens": 578700541.0, + "step": 2014 + }, + { + "epoch": 0.717720391807658, + "grad_norm": 0.6528412103652954, + "learning_rate": 1e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7651544213294983, + "num_tokens": 579004974.0, + "step": 2015 + }, + { + "epoch": 0.7180765805877115, + "grad_norm": 0.6614087820053101, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7594941407442093, + "num_tokens": 579328238.0, + "step": 2016 + }, + { + "epoch": 0.7184327693677649, + "grad_norm": 0.6432814598083496, + "learning_rate": 1e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7546345293521881, + "num_tokens": 579641815.0, + "step": 2017 + }, + { + "epoch": 0.7187889581478183, + "grad_norm": 0.675748884677887, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7675455808639526, + "num_tokens": 579928250.0, + "step": 2018 + }, + { + "epoch": 0.7191451469278718, + "grad_norm": 0.7290952205657959, + "learning_rate": 1e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7537551075220108, + "num_tokens": 580203911.0, + "step": 2019 + }, + { + "epoch": 0.7195013357079252, + "grad_norm": 0.7043212652206421, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7604645788669586, + "num_tokens": 580487535.0, + "step": 2020 + }, + { + "epoch": 0.7198575244879787, + "grad_norm": 0.6916516423225403, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7624894976615906, + "num_tokens": 
580753437.0, + "step": 2021 + }, + { + "epoch": 0.7202137132680321, + "grad_norm": 0.6925210356712341, + "learning_rate": 1e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.7827923744916916, + "num_tokens": 581054157.0, + "step": 2022 + }, + { + "epoch": 0.7205699020480855, + "grad_norm": 0.6974393129348755, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7476927638053894, + "num_tokens": 581358295.0, + "step": 2023 + }, + { + "epoch": 0.720926090828139, + "grad_norm": 0.7150717973709106, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.762870579957962, + "num_tokens": 581638137.0, + "step": 2024 + }, + { + "epoch": 0.7212822796081924, + "grad_norm": 0.7011874914169312, + "learning_rate": 1e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.7780438661575317, + "num_tokens": 581919406.0, + "step": 2025 + }, + { + "epoch": 0.7216384683882457, + "grad_norm": 0.6646504998207092, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7702744752168655, + "num_tokens": 582203277.0, + "step": 2026 + }, + { + "epoch": 0.7219946571682992, + "grad_norm": 0.6772226095199585, + "learning_rate": 1e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7811168879270554, + "num_tokens": 582480721.0, + "step": 2027 + }, + { + "epoch": 0.7223508459483526, + "grad_norm": 0.6832378506660461, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7589175552129745, + "num_tokens": 582771282.0, + "step": 2028 + }, + { + "epoch": 0.722707034728406, + "grad_norm": 0.6652867197990417, + "learning_rate": 1e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.761004701256752, + "num_tokens": 583063360.0, + "step": 2029 + }, + { + "epoch": 0.7230632235084595, + "grad_norm": 0.6362511515617371, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7650971561670303, + "num_tokens": 583387481.0, + "step": 2030 + }, + { + "epoch": 0.7234194122885129, + "grad_norm": 0.6612493991851807, + "learning_rate": 1e-06, + 
"loss": 0.804, + "mean_token_accuracy": 0.7541236579418182, + "num_tokens": 583689461.0, + "step": 2031 + }, + { + "epoch": 0.7237756010685663, + "grad_norm": 0.7233481407165527, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7680519223213196, + "num_tokens": 583966271.0, + "step": 2032 + }, + { + "epoch": 0.7241317898486198, + "grad_norm": 0.6974051594734192, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7523021399974823, + "num_tokens": 584244053.0, + "step": 2033 + }, + { + "epoch": 0.7244879786286732, + "grad_norm": 0.6886685490608215, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7717731893062592, + "num_tokens": 584522716.0, + "step": 2034 + }, + { + "epoch": 0.7248441674087266, + "grad_norm": 0.7091543674468994, + "learning_rate": 1e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7640449851751328, + "num_tokens": 584799854.0, + "step": 2035 + }, + { + "epoch": 0.7252003561887801, + "grad_norm": 0.652128279209137, + "learning_rate": 1e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7803038358688354, + "num_tokens": 585098492.0, + "step": 2036 + }, + { + "epoch": 0.7255565449688335, + "grad_norm": 0.7135480642318726, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.743207186460495, + "num_tokens": 585359249.0, + "step": 2037 + }, + { + "epoch": 0.7259127337488869, + "grad_norm": 0.6759265661239624, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7698640674352646, + "num_tokens": 585644655.0, + "step": 2038 + }, + { + "epoch": 0.7262689225289404, + "grad_norm": 0.7308688163757324, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7548671364784241, + "num_tokens": 585920314.0, + "step": 2039 + }, + { + "epoch": 0.7266251113089938, + "grad_norm": 0.6790480613708496, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7525931000709534, + "num_tokens": 586197486.0, + "step": 2040 + }, + { + "epoch": 
0.7269813000890472, + "grad_norm": 0.7046743631362915, + "learning_rate": 1e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7631720155477524, + "num_tokens": 586477868.0, + "step": 2041 + }, + { + "epoch": 0.7273374888691007, + "grad_norm": 0.6567561030387878, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7536923438310623, + "num_tokens": 586803542.0, + "step": 2042 + }, + { + "epoch": 0.727693677649154, + "grad_norm": 0.6683247089385986, + "learning_rate": 1e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7611033916473389, + "num_tokens": 587095957.0, + "step": 2043 + }, + { + "epoch": 0.7280498664292074, + "grad_norm": 0.6836314797401428, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7568725794553757, + "num_tokens": 587382652.0, + "step": 2044 + }, + { + "epoch": 0.7284060552092609, + "grad_norm": 0.6901034712791443, + "learning_rate": 1e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7583222091197968, + "num_tokens": 587667879.0, + "step": 2045 + }, + { + "epoch": 0.7287622439893143, + "grad_norm": 0.7027484178543091, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7653377503156662, + "num_tokens": 587929519.0, + "step": 2046 + }, + { + "epoch": 0.7291184327693677, + "grad_norm": 0.667599081993103, + "learning_rate": 1e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7615005522966385, + "num_tokens": 588225979.0, + "step": 2047 + }, + { + "epoch": 0.7294746215494212, + "grad_norm": 0.6441097855567932, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.768379345536232, + "num_tokens": 588538514.0, + "step": 2048 + }, + { + "epoch": 0.7298308103294746, + "grad_norm": 0.7328298687934875, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7409410178661346, + "num_tokens": 588797400.0, + "step": 2049 + }, + { + "epoch": 0.730186999109528, + "grad_norm": 0.6629645228385925, + "learning_rate": 1e-06, + "loss": 0.746, + "mean_token_accuracy": 
0.763177216053009, + "num_tokens": 589123959.0, + "step": 2050 + }, + { + "epoch": 0.7305431878895815, + "grad_norm": 0.6967955231666565, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7521119117736816, + "num_tokens": 589407017.0, + "step": 2051 + }, + { + "epoch": 0.7308993766696349, + "grad_norm": 0.7242502570152283, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7543025612831116, + "num_tokens": 589658261.0, + "step": 2052 + }, + { + "epoch": 0.7312555654496883, + "grad_norm": 0.672437310218811, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7626891881227493, + "num_tokens": 589979832.0, + "step": 2053 + }, + { + "epoch": 0.7316117542297418, + "grad_norm": 0.6281040906906128, + "learning_rate": 1e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7582810819149017, + "num_tokens": 590315608.0, + "step": 2054 + }, + { + "epoch": 0.7319679430097952, + "grad_norm": 0.6861167550086975, + "learning_rate": 1e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7504431009292603, + "num_tokens": 590617787.0, + "step": 2055 + }, + { + "epoch": 0.7323241317898486, + "grad_norm": 0.6509913802146912, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7758706361055374, + "num_tokens": 590914851.0, + "step": 2056 + }, + { + "epoch": 0.7326803205699021, + "grad_norm": 0.7335070371627808, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7507793605327606, + "num_tokens": 591185970.0, + "step": 2057 + }, + { + "epoch": 0.7330365093499555, + "grad_norm": 0.700545608997345, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7587338387966156, + "num_tokens": 591454637.0, + "step": 2058 + }, + { + "epoch": 0.733392698130009, + "grad_norm": 0.684907078742981, + "learning_rate": 1e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7604229748249054, + "num_tokens": 591741023.0, + "step": 2059 + }, + { + "epoch": 0.7337488869100623, + "grad_norm": 
0.6763142943382263, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.765076294541359, + "num_tokens": 592019874.0, + "step": 2060 + }, + { + "epoch": 0.7341050756901157, + "grad_norm": 0.7052083611488342, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7602812647819519, + "num_tokens": 592297970.0, + "step": 2061 + }, + { + "epoch": 0.7344612644701692, + "grad_norm": 0.7411156296730042, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7694577425718307, + "num_tokens": 592533662.0, + "step": 2062 + }, + { + "epoch": 0.7348174532502226, + "grad_norm": 0.6625939607620239, + "learning_rate": 1e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7762722223997116, + "num_tokens": 592826086.0, + "step": 2063 + }, + { + "epoch": 0.735173642030276, + "grad_norm": 0.681991457939148, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7589648216962814, + "num_tokens": 593124998.0, + "step": 2064 + }, + { + "epoch": 0.7355298308103295, + "grad_norm": 0.6386350989341736, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.770972415804863, + "num_tokens": 593446158.0, + "step": 2065 + }, + { + "epoch": 0.7358860195903829, + "grad_norm": 0.7540469169616699, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7425345778465271, + "num_tokens": 593717687.0, + "step": 2066 + }, + { + "epoch": 0.7362422083704363, + "grad_norm": 0.676577627658844, + "learning_rate": 1e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7690312266349792, + "num_tokens": 594003545.0, + "step": 2067 + }, + { + "epoch": 0.7365983971504898, + "grad_norm": 0.6323738098144531, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7714697420597076, + "num_tokens": 594314618.0, + "step": 2068 + }, + { + "epoch": 0.7369545859305432, + "grad_norm": 0.7059041857719421, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7625230401754379, + "num_tokens": 
594580556.0, + "step": 2069 + }, + { + "epoch": 0.7373107747105966, + "grad_norm": 0.7036329507827759, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.7591599524021149, + "num_tokens": 594867509.0, + "step": 2070 + }, + { + "epoch": 0.7376669634906501, + "grad_norm": 0.6968417763710022, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7437565922737122, + "num_tokens": 595141978.0, + "step": 2071 + }, + { + "epoch": 0.7380231522707035, + "grad_norm": 0.6570211052894592, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.767461508512497, + "num_tokens": 595426509.0, + "step": 2072 + }, + { + "epoch": 0.7383793410507569, + "grad_norm": 0.7352275252342224, + "learning_rate": 1e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7507586777210236, + "num_tokens": 595696984.0, + "step": 2073 + }, + { + "epoch": 0.7387355298308104, + "grad_norm": 0.6694819927215576, + "learning_rate": 1e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7751466035842896, + "num_tokens": 595976552.0, + "step": 2074 + }, + { + "epoch": 0.7390917186108638, + "grad_norm": 0.6798092722892761, + "learning_rate": 1e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7708611786365509, + "num_tokens": 596276242.0, + "step": 2075 + }, + { + "epoch": 0.7394479073909171, + "grad_norm": 0.673113226890564, + "learning_rate": 1e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7753875404596329, + "num_tokens": 596552166.0, + "step": 2076 + }, + { + "epoch": 0.7398040961709706, + "grad_norm": 0.6991550922393799, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7608000189065933, + "num_tokens": 596846885.0, + "step": 2077 + }, + { + "epoch": 0.740160284951024, + "grad_norm": 0.6994513273239136, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7348558008670807, + "num_tokens": 597127453.0, + "step": 2078 + }, + { + "epoch": 0.7405164737310774, + "grad_norm": 0.7408109307289124, + "learning_rate": 1e-06, + 
"loss": 0.7378, + "mean_token_accuracy": 0.7651698887348175, + "num_tokens": 597378162.0, + "step": 2079 + }, + { + "epoch": 0.7408726625111309, + "grad_norm": 0.6740879416465759, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7697455286979675, + "num_tokens": 597679913.0, + "step": 2080 + }, + { + "epoch": 0.7412288512911843, + "grad_norm": 0.6731386780738831, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7506762742996216, + "num_tokens": 597996009.0, + "step": 2081 + }, + { + "epoch": 0.7415850400712377, + "grad_norm": 0.673046886920929, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7620511204004288, + "num_tokens": 598296809.0, + "step": 2082 + }, + { + "epoch": 0.7419412288512912, + "grad_norm": 0.652625322341919, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7656930387020111, + "num_tokens": 598591024.0, + "step": 2083 + }, + { + "epoch": 0.7422974176313446, + "grad_norm": 0.717775821685791, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7505688518285751, + "num_tokens": 598884353.0, + "step": 2084 + }, + { + "epoch": 0.742653606411398, + "grad_norm": 0.6967631578445435, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.759423702955246, + "num_tokens": 599162424.0, + "step": 2085 + }, + { + "epoch": 0.7430097951914515, + "grad_norm": 0.7046756744384766, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7529041469097137, + "num_tokens": 599437308.0, + "step": 2086 + }, + { + "epoch": 0.7433659839715049, + "grad_norm": 0.6748732924461365, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7712004482746124, + "num_tokens": 599720197.0, + "step": 2087 + }, + { + "epoch": 0.7437221727515583, + "grad_norm": 0.6667424440383911, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7575422376394272, + "num_tokens": 600046075.0, + "step": 2088 + }, + { + "epoch": 
0.7440783615316118, + "grad_norm": 0.727384626865387, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7462697774171829, + "num_tokens": 600329171.0, + "step": 2089 + }, + { + "epoch": 0.7444345503116652, + "grad_norm": 0.6273883581161499, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.767820879817009, + "num_tokens": 600658825.0, + "step": 2090 + }, + { + "epoch": 0.7447907390917186, + "grad_norm": 0.7116818428039551, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7393565326929092, + "num_tokens": 600939429.0, + "step": 2091 + }, + { + "epoch": 0.7451469278717721, + "grad_norm": 0.6432709693908691, + "learning_rate": 1e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7693769782781601, + "num_tokens": 601262303.0, + "step": 2092 + }, + { + "epoch": 0.7455031166518254, + "grad_norm": 0.7873426675796509, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7568593323230743, + "num_tokens": 601516397.0, + "step": 2093 + }, + { + "epoch": 0.745859305431879, + "grad_norm": 0.6309319734573364, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7611037641763687, + "num_tokens": 601835327.0, + "step": 2094 + }, + { + "epoch": 0.7462154942119323, + "grad_norm": 0.7315190434455872, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7480624914169312, + "num_tokens": 602107797.0, + "step": 2095 + }, + { + "epoch": 0.7465716829919857, + "grad_norm": 0.6925815939903259, + "learning_rate": 1e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7683458775281906, + "num_tokens": 602417870.0, + "step": 2096 + }, + { + "epoch": 0.7469278717720392, + "grad_norm": 0.7944415211677551, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7560177445411682, + "num_tokens": 602678359.0, + "step": 2097 + }, + { + "epoch": 0.7472840605520926, + "grad_norm": 0.7004518508911133, + "learning_rate": 1e-06, + "loss": 0.6927, + "mean_token_accuracy": 
0.7797493487596512, + "num_tokens": 602946280.0, + "step": 2098 + }, + { + "epoch": 0.747640249332146, + "grad_norm": 0.6828389167785645, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.74820277094841, + "num_tokens": 603229592.0, + "step": 2099 + }, + { + "epoch": 0.7479964381121995, + "grad_norm": 0.7032861709594727, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7601749151945114, + "num_tokens": 603525350.0, + "step": 2100 + }, + { + "epoch": 0.7483526268922529, + "grad_norm": 0.6905547976493835, + "learning_rate": 1e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7559593170881271, + "num_tokens": 603818812.0, + "step": 2101 + }, + { + "epoch": 0.7487088156723063, + "grad_norm": 0.7595664262771606, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7597927004098892, + "num_tokens": 604104963.0, + "step": 2102 + }, + { + "epoch": 0.7490650044523598, + "grad_norm": 0.6518548727035522, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7548578232526779, + "num_tokens": 604421024.0, + "step": 2103 + }, + { + "epoch": 0.7494211932324132, + "grad_norm": 0.6758009195327759, + "learning_rate": 1e-06, + "loss": 0.7944, + "mean_token_accuracy": 0.7570549100637436, + "num_tokens": 604710775.0, + "step": 2104 + }, + { + "epoch": 0.7497773820124666, + "grad_norm": 0.6923474669456482, + "learning_rate": 1e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7621518522500992, + "num_tokens": 604983342.0, + "step": 2105 + }, + { + "epoch": 0.7501335707925201, + "grad_norm": 0.6659507751464844, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7736171334981918, + "num_tokens": 605245753.0, + "step": 2106 + }, + { + "epoch": 0.7504897595725735, + "grad_norm": 0.6918862462043762, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7592862993478775, + "num_tokens": 605514044.0, + "step": 2107 + }, + { + "epoch": 0.7508459483526269, + "grad_norm": 
0.719658374786377, + "learning_rate": 1e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7678914219141006, + "num_tokens": 605774937.0, + "step": 2108 + }, + { + "epoch": 0.7512021371326804, + "grad_norm": 0.6617377400398254, + "learning_rate": 1e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.771305039525032, + "num_tokens": 606075309.0, + "step": 2109 + }, + { + "epoch": 0.7515583259127337, + "grad_norm": 0.6778964996337891, + "learning_rate": 1e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7740915566682816, + "num_tokens": 606350140.0, + "step": 2110 + }, + { + "epoch": 0.7519145146927871, + "grad_norm": 0.633124828338623, + "learning_rate": 1e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7581527084112167, + "num_tokens": 606672548.0, + "step": 2111 + }, + { + "epoch": 0.7522707034728406, + "grad_norm": 0.6742658615112305, + "learning_rate": 1e-06, + "loss": 0.6405, + "mean_token_accuracy": 0.7919428199529648, + "num_tokens": 606946570.0, + "step": 2112 + }, + { + "epoch": 0.752626892252894, + "grad_norm": 0.736462414264679, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7411166280508041, + "num_tokens": 607210219.0, + "step": 2113 + }, + { + "epoch": 0.7529830810329474, + "grad_norm": 0.7053705453872681, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7632166296243668, + "num_tokens": 607518617.0, + "step": 2114 + }, + { + "epoch": 0.7533392698130009, + "grad_norm": 0.6478431224822998, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7703743427991867, + "num_tokens": 607827331.0, + "step": 2115 + }, + { + "epoch": 0.7536954585930543, + "grad_norm": 0.6453032493591309, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7737420201301575, + "num_tokens": 608150342.0, + "step": 2116 + }, + { + "epoch": 0.7540516473731077, + "grad_norm": 0.7499276399612427, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7680772095918655, + "num_tokens": 
608416799.0, + "step": 2117 + }, + { + "epoch": 0.7544078361531612, + "grad_norm": 0.6739565134048462, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7586953639984131, + "num_tokens": 608704715.0, + "step": 2118 + }, + { + "epoch": 0.7547640249332146, + "grad_norm": 0.676565408706665, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7585341930389404, + "num_tokens": 608996926.0, + "step": 2119 + }, + { + "epoch": 0.755120213713268, + "grad_norm": 0.6456121802330017, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7599484473466873, + "num_tokens": 609329555.0, + "step": 2120 + }, + { + "epoch": 0.7554764024933215, + "grad_norm": 0.6733473539352417, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7465777397155762, + "num_tokens": 609625333.0, + "step": 2121 + }, + { + "epoch": 0.7558325912733749, + "grad_norm": 0.7412051558494568, + "learning_rate": 1e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7593388110399246, + "num_tokens": 609883093.0, + "step": 2122 + }, + { + "epoch": 0.7561887800534283, + "grad_norm": 0.6342110633850098, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7674146890640259, + "num_tokens": 610210578.0, + "step": 2123 + }, + { + "epoch": 0.7565449688334818, + "grad_norm": 0.677346408367157, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7599861472845078, + "num_tokens": 610505795.0, + "step": 2124 + }, + { + "epoch": 0.7569011576135352, + "grad_norm": 0.6662432551383972, + "learning_rate": 1e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.778209999203682, + "num_tokens": 610805608.0, + "step": 2125 + }, + { + "epoch": 0.7572573463935885, + "grad_norm": 0.6704002618789673, + "learning_rate": 1e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7686347812414169, + "num_tokens": 611087809.0, + "step": 2126 + }, + { + "epoch": 0.757613535173642, + "grad_norm": 0.7012842297554016, + "learning_rate": 1e-06, + 
"loss": 0.7582, + "mean_token_accuracy": 0.7638252824544907, + "num_tokens": 611362054.0, + "step": 2127 + }, + { + "epoch": 0.7579697239536954, + "grad_norm": 0.7042873501777649, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7521673887968063, + "num_tokens": 611647908.0, + "step": 2128 + }, + { + "epoch": 0.7583259127337489, + "grad_norm": 0.6490145325660706, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.754746600985527, + "num_tokens": 611960583.0, + "step": 2129 + }, + { + "epoch": 0.7586821015138023, + "grad_norm": 0.6785764098167419, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7484055608510971, + "num_tokens": 612254826.0, + "step": 2130 + }, + { + "epoch": 0.7590382902938557, + "grad_norm": 0.6438121795654297, + "learning_rate": 1e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.782650500535965, + "num_tokens": 612574729.0, + "step": 2131 + }, + { + "epoch": 0.7593944790739092, + "grad_norm": 0.6526467800140381, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7615627348423004, + "num_tokens": 612886970.0, + "step": 2132 + }, + { + "epoch": 0.7597506678539626, + "grad_norm": 0.6710110306739807, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7602566927671432, + "num_tokens": 613170975.0, + "step": 2133 + }, + { + "epoch": 0.760106856634016, + "grad_norm": 0.6692309379577637, + "learning_rate": 1e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7624085992574692, + "num_tokens": 613453104.0, + "step": 2134 + }, + { + "epoch": 0.7604630454140695, + "grad_norm": 0.7025964856147766, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7645601481199265, + "num_tokens": 613732038.0, + "step": 2135 + }, + { + "epoch": 0.7608192341941229, + "grad_norm": 0.7042779922485352, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7570732980966568, + "num_tokens": 614017030.0, + "step": 2136 + }, + { + "epoch": 
0.7611754229741763, + "grad_norm": 0.6447991728782654, + "learning_rate": 1e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7810547351837158, + "num_tokens": 614330980.0, + "step": 2137 + }, + { + "epoch": 0.7615316117542298, + "grad_norm": 0.659448504447937, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7493570894002914, + "num_tokens": 614655079.0, + "step": 2138 + }, + { + "epoch": 0.7618878005342832, + "grad_norm": 0.6601020693778992, + "learning_rate": 1e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7599871009588242, + "num_tokens": 614960385.0, + "step": 2139 + }, + { + "epoch": 0.7622439893143366, + "grad_norm": 0.6805604100227356, + "learning_rate": 1e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.749778226017952, + "num_tokens": 615246655.0, + "step": 2140 + }, + { + "epoch": 0.7626001780943901, + "grad_norm": 0.6412054896354675, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7693493664264679, + "num_tokens": 615551290.0, + "step": 2141 + }, + { + "epoch": 0.7629563668744435, + "grad_norm": 0.6484102010726929, + "learning_rate": 1e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.7527663558721542, + "num_tokens": 615866102.0, + "step": 2142 + }, + { + "epoch": 0.7633125556544968, + "grad_norm": 0.622435450553894, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7544578909873962, + "num_tokens": 616214338.0, + "step": 2143 + }, + { + "epoch": 0.7636687444345504, + "grad_norm": 0.7087688446044922, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.744209960103035, + "num_tokens": 616482392.0, + "step": 2144 + }, + { + "epoch": 0.7640249332146037, + "grad_norm": 0.695088803768158, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.742654949426651, + "num_tokens": 616772249.0, + "step": 2145 + }, + { + "epoch": 0.7643811219946571, + "grad_norm": 0.645632803440094, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 
0.7615219056606293, + "num_tokens": 617093518.0, + "step": 2146 + }, + { + "epoch": 0.7647373107747106, + "grad_norm": 0.6784905791282654, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7516431659460068, + "num_tokens": 617399527.0, + "step": 2147 + }, + { + "epoch": 0.765093499554764, + "grad_norm": 0.7152115106582642, + "learning_rate": 1e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7607123106718063, + "num_tokens": 617666693.0, + "step": 2148 + }, + { + "epoch": 0.7654496883348174, + "grad_norm": 0.687500536441803, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7536334097385406, + "num_tokens": 617938337.0, + "step": 2149 + }, + { + "epoch": 0.7658058771148709, + "grad_norm": 0.6958329677581787, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7670588791370392, + "num_tokens": 618219860.0, + "step": 2150 + }, + { + "epoch": 0.7661620658949243, + "grad_norm": 0.7215254902839661, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7573477327823639, + "num_tokens": 618496869.0, + "step": 2151 + }, + { + "epoch": 0.7665182546749777, + "grad_norm": 0.688194990158081, + "learning_rate": 1e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7499912828207016, + "num_tokens": 618797919.0, + "step": 2152 + }, + { + "epoch": 0.7668744434550312, + "grad_norm": 0.6729050278663635, + "learning_rate": 1e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7815789878368378, + "num_tokens": 619080151.0, + "step": 2153 + }, + { + "epoch": 0.7672306322350846, + "grad_norm": 0.71184241771698, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7472872585058212, + "num_tokens": 619354447.0, + "step": 2154 + }, + { + "epoch": 0.767586821015138, + "grad_norm": 0.7190858125686646, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7651031166315079, + "num_tokens": 619621160.0, + "step": 2155 + }, + { + "epoch": 0.7679430097951915, + "grad_norm": 
0.7326929569244385, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7561431974172592, + "num_tokens": 619899760.0, + "step": 2156 + }, + { + "epoch": 0.7682991985752449, + "grad_norm": 0.694175124168396, + "learning_rate": 1e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7676057517528534, + "num_tokens": 620170182.0, + "step": 2157 + }, + { + "epoch": 0.7686553873552983, + "grad_norm": 0.6800486445426941, + "learning_rate": 1e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7761542797088623, + "num_tokens": 620443907.0, + "step": 2158 + }, + { + "epoch": 0.7690115761353518, + "grad_norm": 0.6519238948822021, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7617665529251099, + "num_tokens": 620764691.0, + "step": 2159 + }, + { + "epoch": 0.7693677649154052, + "grad_norm": 0.6933582425117493, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7466940581798553, + "num_tokens": 621050564.0, + "step": 2160 + }, + { + "epoch": 0.7697239536954585, + "grad_norm": 0.6759306192398071, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7579586803913116, + "num_tokens": 621354762.0, + "step": 2161 + }, + { + "epoch": 0.770080142475512, + "grad_norm": 0.6677928566932678, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7690781503915787, + "num_tokens": 621655966.0, + "step": 2162 + }, + { + "epoch": 0.7704363312555654, + "grad_norm": 0.6296359896659851, + "learning_rate": 1e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7816324681043625, + "num_tokens": 621958969.0, + "step": 2163 + }, + { + "epoch": 0.7707925200356189, + "grad_norm": 0.6770095229148865, + "learning_rate": 1e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7635220289230347, + "num_tokens": 622235846.0, + "step": 2164 + }, + { + "epoch": 0.7711487088156723, + "grad_norm": 0.6703883409500122, + "learning_rate": 1e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7521441429853439, + "num_tokens": 
622532445.0, + "step": 2165 + }, + { + "epoch": 0.7715048975957257, + "grad_norm": 0.6607191562652588, + "learning_rate": 1e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.7536987811326981, + "num_tokens": 622836614.0, + "step": 2166 + }, + { + "epoch": 0.7718610863757792, + "grad_norm": 0.7173295617103577, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7507565170526505, + "num_tokens": 623089095.0, + "step": 2167 + }, + { + "epoch": 0.7722172751558326, + "grad_norm": 0.687114953994751, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7655284553766251, + "num_tokens": 623373993.0, + "step": 2168 + }, + { + "epoch": 0.772573463935886, + "grad_norm": 0.6475800275802612, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7629189193248749, + "num_tokens": 623659568.0, + "step": 2169 + }, + { + "epoch": 0.7729296527159395, + "grad_norm": 0.6411698460578918, + "learning_rate": 1e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.7869107276201248, + "num_tokens": 623949983.0, + "step": 2170 + }, + { + "epoch": 0.7732858414959929, + "grad_norm": 0.7038818597793579, + "learning_rate": 1e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7755648493766785, + "num_tokens": 624227056.0, + "step": 2171 + }, + { + "epoch": 0.7736420302760463, + "grad_norm": 0.7150063514709473, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7585143595933914, + "num_tokens": 624515844.0, + "step": 2172 + }, + { + "epoch": 0.7739982190560998, + "grad_norm": 0.7328695058822632, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7600791603326797, + "num_tokens": 624791658.0, + "step": 2173 + }, + { + "epoch": 0.7743544078361532, + "grad_norm": 0.6681681275367737, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.747261717915535, + "num_tokens": 625100525.0, + "step": 2174 + }, + { + "epoch": 0.7747105966162066, + "grad_norm": 0.7742382884025574, + "learning_rate": 1e-06, + 
"loss": 0.7971, + "mean_token_accuracy": 0.753626361489296, + "num_tokens": 625339125.0, + "step": 2175 + }, + { + "epoch": 0.7750667853962601, + "grad_norm": 0.6842339634895325, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7569987326860428, + "num_tokens": 625637733.0, + "step": 2176 + }, + { + "epoch": 0.7754229741763135, + "grad_norm": 0.6711580753326416, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7665082216262817, + "num_tokens": 625948791.0, + "step": 2177 + }, + { + "epoch": 0.7757791629563668, + "grad_norm": 0.771247923374176, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7719955593347549, + "num_tokens": 626229647.0, + "step": 2178 + }, + { + "epoch": 0.7761353517364203, + "grad_norm": 0.6788225173950195, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7663293778896332, + "num_tokens": 626521839.0, + "step": 2179 + }, + { + "epoch": 0.7764915405164737, + "grad_norm": 0.6383079886436462, + "learning_rate": 1e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.750314474105835, + "num_tokens": 626854159.0, + "step": 2180 + }, + { + "epoch": 0.7768477292965271, + "grad_norm": 0.6686701774597168, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7674344778060913, + "num_tokens": 627163815.0, + "step": 2181 + }, + { + "epoch": 0.7772039180765806, + "grad_norm": 0.6830407381057739, + "learning_rate": 1e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7801990658044815, + "num_tokens": 627475100.0, + "step": 2182 + }, + { + "epoch": 0.777560106856634, + "grad_norm": 0.6453263163566589, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7550675868988037, + "num_tokens": 627776767.0, + "step": 2183 + }, + { + "epoch": 0.7779162956366874, + "grad_norm": 0.6421627998352051, + "learning_rate": 1e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7739149779081345, + "num_tokens": 628086067.0, + "step": 2184 + }, + { + "epoch": 
0.7782724844167409, + "grad_norm": 0.6620574593544006, + "learning_rate": 1e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7545699924230576, + "num_tokens": 628388647.0, + "step": 2185 + }, + { + "epoch": 0.7786286731967943, + "grad_norm": 0.7198042869567871, + "learning_rate": 1e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7686470150947571, + "num_tokens": 628661096.0, + "step": 2186 + }, + { + "epoch": 0.7789848619768477, + "grad_norm": 0.7113752365112305, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7663455754518509, + "num_tokens": 628939848.0, + "step": 2187 + }, + { + "epoch": 0.7793410507569012, + "grad_norm": 0.7139882445335388, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7497067153453827, + "num_tokens": 629221266.0, + "step": 2188 + }, + { + "epoch": 0.7796972395369546, + "grad_norm": 0.7186914682388306, + "learning_rate": 1e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7552275210618973, + "num_tokens": 629515865.0, + "step": 2189 + }, + { + "epoch": 0.780053428317008, + "grad_norm": 0.6845049858093262, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7581076771020889, + "num_tokens": 629813111.0, + "step": 2190 + }, + { + "epoch": 0.7804096170970615, + "grad_norm": 0.7083641886711121, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7568044066429138, + "num_tokens": 630075214.0, + "step": 2191 + }, + { + "epoch": 0.7807658058771149, + "grad_norm": 0.7025586366653442, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7639073431491852, + "num_tokens": 630380131.0, + "step": 2192 + }, + { + "epoch": 0.7811219946571683, + "grad_norm": 0.6831263303756714, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7645834386348724, + "num_tokens": 630664089.0, + "step": 2193 + }, + { + "epoch": 0.7814781834372218, + "grad_norm": 0.7137346863746643, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 
0.7578794807195663, + "num_tokens": 630967935.0, + "step": 2194 + }, + { + "epoch": 0.7818343722172751, + "grad_norm": 0.6659604907035828, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.75909723341465, + "num_tokens": 631275158.0, + "step": 2195 + }, + { + "epoch": 0.7821905609973285, + "grad_norm": 0.6762608885765076, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.767833948135376, + "num_tokens": 631543869.0, + "step": 2196 + }, + { + "epoch": 0.782546749777382, + "grad_norm": 0.6888765096664429, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7519587576389313, + "num_tokens": 631829437.0, + "step": 2197 + }, + { + "epoch": 0.7829029385574354, + "grad_norm": 0.7268760800361633, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7295850217342377, + "num_tokens": 632136468.0, + "step": 2198 + }, + { + "epoch": 0.7832591273374888, + "grad_norm": 0.7252092957496643, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7458752244710922, + "num_tokens": 632409174.0, + "step": 2199 + }, + { + "epoch": 0.7836153161175423, + "grad_norm": 0.6506556868553162, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7485576272010803, + "num_tokens": 632731442.0, + "step": 2200 + }, + { + "epoch": 0.7839715048975957, + "grad_norm": 0.6894707083702087, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.742341160774231, + "num_tokens": 633007450.0, + "step": 2201 + }, + { + "epoch": 0.7843276936776492, + "grad_norm": 0.6788743138313293, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7539365738630295, + "num_tokens": 633331378.0, + "step": 2202 + }, + { + "epoch": 0.7846838824577026, + "grad_norm": 0.7382684350013733, + "learning_rate": 1e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7825951278209686, + "num_tokens": 633622132.0, + "step": 2203 + }, + { + "epoch": 0.785040071237756, + "grad_norm": 
0.6882736682891846, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.736687645316124, + "num_tokens": 633923635.0, + "step": 2204 + }, + { + "epoch": 0.7853962600178095, + "grad_norm": 0.6628815531730652, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7529479563236237, + "num_tokens": 634251744.0, + "step": 2205 + }, + { + "epoch": 0.7857524487978629, + "grad_norm": 0.7491495013237, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.770216554403305, + "num_tokens": 634484357.0, + "step": 2206 + }, + { + "epoch": 0.7861086375779163, + "grad_norm": 0.6874408721923828, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7470593452453613, + "num_tokens": 634774265.0, + "step": 2207 + }, + { + "epoch": 0.7864648263579698, + "grad_norm": 0.6905665993690491, + "learning_rate": 1e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7703321427106857, + "num_tokens": 635061371.0, + "step": 2208 + }, + { + "epoch": 0.7868210151380232, + "grad_norm": 0.6808715462684631, + "learning_rate": 1e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7663966566324234, + "num_tokens": 635355457.0, + "step": 2209 + }, + { + "epoch": 0.7871772039180766, + "grad_norm": 0.680727481842041, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7506867051124573, + "num_tokens": 635631392.0, + "step": 2210 + }, + { + "epoch": 0.78753339269813, + "grad_norm": 0.6583399772644043, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7643283605575562, + "num_tokens": 635925968.0, + "step": 2211 + }, + { + "epoch": 0.7878895814781834, + "grad_norm": 0.7235428094863892, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7727478891611099, + "num_tokens": 636200369.0, + "step": 2212 + }, + { + "epoch": 0.7882457702582368, + "grad_norm": 0.7213712334632874, + "learning_rate": 1e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7463674396276474, + "num_tokens": 
636485082.0, + "step": 2213 + }, + { + "epoch": 0.7886019590382903, + "grad_norm": 0.6690478324890137, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7489331662654877, + "num_tokens": 636826824.0, + "step": 2214 + }, + { + "epoch": 0.7889581478183437, + "grad_norm": 0.6987161636352539, + "learning_rate": 1e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7697067260742188, + "num_tokens": 637095307.0, + "step": 2215 + }, + { + "epoch": 0.7893143365983971, + "grad_norm": 0.7417265772819519, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7321103513240814, + "num_tokens": 637388241.0, + "step": 2216 + }, + { + "epoch": 0.7896705253784506, + "grad_norm": 0.6302417516708374, + "learning_rate": 1e-06, + "loss": 0.732, + "mean_token_accuracy": 0.763682559132576, + "num_tokens": 637690112.0, + "step": 2217 + }, + { + "epoch": 0.790026714158504, + "grad_norm": 0.6779422760009766, + "learning_rate": 1e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7689340561628342, + "num_tokens": 637987063.0, + "step": 2218 + }, + { + "epoch": 0.7903829029385574, + "grad_norm": 0.6627609729766846, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7473626285791397, + "num_tokens": 638285364.0, + "step": 2219 + }, + { + "epoch": 0.7907390917186109, + "grad_norm": 0.7003564834594727, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7476512938737869, + "num_tokens": 638582068.0, + "step": 2220 + }, + { + "epoch": 0.7910952804986643, + "grad_norm": 0.6537190079689026, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7502158582210541, + "num_tokens": 638890431.0, + "step": 2221 + }, + { + "epoch": 0.7914514692787177, + "grad_norm": 0.669823408126831, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7550792396068573, + "num_tokens": 639190287.0, + "step": 2222 + }, + { + "epoch": 0.7918076580587712, + "grad_norm": 0.6851491332054138, + "learning_rate": 1e-06, + 
"loss": 0.7905, + "mean_token_accuracy": 0.7553387880325317, + "num_tokens": 639454839.0, + "step": 2223 + }, + { + "epoch": 0.7921638468388246, + "grad_norm": 0.6900203227996826, + "learning_rate": 1e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.754290297627449, + "num_tokens": 639739526.0, + "step": 2224 + }, + { + "epoch": 0.792520035618878, + "grad_norm": 0.7180584669113159, + "learning_rate": 1e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7618732154369354, + "num_tokens": 640014040.0, + "step": 2225 + }, + { + "epoch": 0.7928762243989315, + "grad_norm": 0.6851544380187988, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7656569182872772, + "num_tokens": 640333679.0, + "step": 2226 + }, + { + "epoch": 0.7932324131789849, + "grad_norm": 0.7273793816566467, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7554309964179993, + "num_tokens": 640605664.0, + "step": 2227 + }, + { + "epoch": 0.7935886019590382, + "grad_norm": 0.6639474630355835, + "learning_rate": 1e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.7750280946493149, + "num_tokens": 640874971.0, + "step": 2228 + }, + { + "epoch": 0.7939447907390917, + "grad_norm": 0.6818947196006775, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7611217647790909, + "num_tokens": 641155507.0, + "step": 2229 + }, + { + "epoch": 0.7943009795191451, + "grad_norm": 0.6751012802124023, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7628983408212662, + "num_tokens": 641440559.0, + "step": 2230 + }, + { + "epoch": 0.7946571682991985, + "grad_norm": 0.7145196795463562, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.768434152007103, + "num_tokens": 641712434.0, + "step": 2231 + }, + { + "epoch": 0.795013357079252, + "grad_norm": 0.6662778854370117, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7477295249700546, + "num_tokens": 642014586.0, + "step": 2232 + }, + { + "epoch": 
0.7953695458593054, + "grad_norm": 0.7014389634132385, + "learning_rate": 1e-06, + "loss": 0.78, + "mean_token_accuracy": 0.7528580576181412, + "num_tokens": 642283292.0, + "step": 2233 + }, + { + "epoch": 0.7957257346393588, + "grad_norm": 0.6792690753936768, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7629737108945847, + "num_tokens": 642581344.0, + "step": 2234 + }, + { + "epoch": 0.7960819234194123, + "grad_norm": 0.6928903460502625, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7710779011249542, + "num_tokens": 642882280.0, + "step": 2235 + }, + { + "epoch": 0.7964381121994657, + "grad_norm": 0.6789652109146118, + "learning_rate": 1e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7575213611125946, + "num_tokens": 643174039.0, + "step": 2236 + }, + { + "epoch": 0.7967943009795192, + "grad_norm": 0.6555347442626953, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7719069719314575, + "num_tokens": 643460975.0, + "step": 2237 + }, + { + "epoch": 0.7971504897595726, + "grad_norm": 0.6577290892601013, + "learning_rate": 1e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.7536243200302124, + "num_tokens": 643764810.0, + "step": 2238 + }, + { + "epoch": 0.797506678539626, + "grad_norm": 0.6805257201194763, + "learning_rate": 1e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7721560746431351, + "num_tokens": 644077396.0, + "step": 2239 + }, + { + "epoch": 0.7978628673196795, + "grad_norm": 0.6822432279586792, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7637694180011749, + "num_tokens": 644365247.0, + "step": 2240 + }, + { + "epoch": 0.7982190560997329, + "grad_norm": 0.7284189462661743, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7433735132217407, + "num_tokens": 644646347.0, + "step": 2241 + }, + { + "epoch": 0.7985752448797863, + "grad_norm": 0.6233003735542297, + "learning_rate": 1e-06, + "loss": 0.6892, + "mean_token_accuracy": 
0.7795185744762421, + "num_tokens": 644980199.0, + "step": 2242 + }, + { + "epoch": 0.7989314336598398, + "grad_norm": 0.6922442317008972, + "learning_rate": 1e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7601959854364395, + "num_tokens": 645248047.0, + "step": 2243 + }, + { + "epoch": 0.7992876224398932, + "grad_norm": 0.6920552849769592, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7577676177024841, + "num_tokens": 645523810.0, + "step": 2244 + }, + { + "epoch": 0.7996438112199465, + "grad_norm": 0.6697332859039307, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7602120041847229, + "num_tokens": 645818707.0, + "step": 2245 + }, + { + "epoch": 0.8, + "grad_norm": 0.6840434074401855, + "learning_rate": 1e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7533604353666306, + "num_tokens": 646101758.0, + "step": 2246 + }, + { + "epoch": 0.8003561887800534, + "grad_norm": 0.6692554354667664, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.752812996506691, + "num_tokens": 646416276.0, + "step": 2247 + }, + { + "epoch": 0.8007123775601068, + "grad_norm": 0.666330099105835, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7729169726371765, + "num_tokens": 646730598.0, + "step": 2248 + }, + { + "epoch": 0.8010685663401603, + "grad_norm": 0.6714534163475037, + "learning_rate": 1e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.782706081867218, + "num_tokens": 647003604.0, + "step": 2249 + }, + { + "epoch": 0.8014247551202137, + "grad_norm": 0.6726563572883606, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7621323019266129, + "num_tokens": 647279003.0, + "step": 2250 + }, + { + "epoch": 0.8017809439002671, + "grad_norm": 0.6701948046684265, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7554466426372528, + "num_tokens": 647568937.0, + "step": 2251 + }, + { + "epoch": 0.8021371326803206, + "grad_norm": 0.6997784376144409, + 
"learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7656364589929581, + "num_tokens": 647864243.0, + "step": 2252 + }, + { + "epoch": 0.802493321460374, + "grad_norm": 0.6900641918182373, + "learning_rate": 1e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.7657664120197296, + "num_tokens": 648151031.0, + "step": 2253 + }, + { + "epoch": 0.8028495102404274, + "grad_norm": 0.6727526783943176, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7533716261386871, + "num_tokens": 648435176.0, + "step": 2254 + }, + { + "epoch": 0.8032056990204809, + "grad_norm": 0.7218172550201416, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7631248086690903, + "num_tokens": 648694014.0, + "step": 2255 + }, + { + "epoch": 0.8035618878005343, + "grad_norm": 0.6908507347106934, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.74900022149086, + "num_tokens": 648971578.0, + "step": 2256 + }, + { + "epoch": 0.8039180765805877, + "grad_norm": 0.6616504192352295, + "learning_rate": 1e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7719068229198456, + "num_tokens": 649256429.0, + "step": 2257 + }, + { + "epoch": 0.8042742653606412, + "grad_norm": 0.6677389144897461, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7689089328050613, + "num_tokens": 649535188.0, + "step": 2258 + }, + { + "epoch": 0.8046304541406946, + "grad_norm": 0.677201509475708, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.761580616235733, + "num_tokens": 649817811.0, + "step": 2259 + }, + { + "epoch": 0.804986642920748, + "grad_norm": 0.6658285856246948, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7527831941843033, + "num_tokens": 650110660.0, + "step": 2260 + }, + { + "epoch": 0.8053428317008015, + "grad_norm": 0.7230237126350403, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.758300170302391, + "num_tokens": 650377370.0, + "step": 2261 + 
}, + { + "epoch": 0.8056990204808548, + "grad_norm": 0.6528153419494629, + "learning_rate": 1e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7656248211860657, + "num_tokens": 650671140.0, + "step": 2262 + }, + { + "epoch": 0.8060552092609082, + "grad_norm": 0.7769148349761963, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7604284733533859, + "num_tokens": 650939126.0, + "step": 2263 + }, + { + "epoch": 0.8064113980409617, + "grad_norm": 0.7336376309394836, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.763325423002243, + "num_tokens": 651205951.0, + "step": 2264 + }, + { + "epoch": 0.8067675868210151, + "grad_norm": 0.675431489944458, + "learning_rate": 1e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7523181736469269, + "num_tokens": 651499573.0, + "step": 2265 + }, + { + "epoch": 0.8071237756010685, + "grad_norm": 0.6755803823471069, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7519099712371826, + "num_tokens": 651785073.0, + "step": 2266 + }, + { + "epoch": 0.807479964381122, + "grad_norm": 0.6910105347633362, + "learning_rate": 1e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7489993572235107, + "num_tokens": 652089506.0, + "step": 2267 + }, + { + "epoch": 0.8078361531611754, + "grad_norm": 0.7479743957519531, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.735759437084198, + "num_tokens": 652331430.0, + "step": 2268 + }, + { + "epoch": 0.8081923419412288, + "grad_norm": 0.6830007433891296, + "learning_rate": 1e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7556213140487671, + "num_tokens": 652600063.0, + "step": 2269 + }, + { + "epoch": 0.8085485307212823, + "grad_norm": 0.65208899974823, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7559776604175568, + "num_tokens": 652896655.0, + "step": 2270 + }, + { + "epoch": 0.8089047195013357, + "grad_norm": 0.7047868967056274, + "learning_rate": 1e-06, + "loss": 0.7007, + 
"mean_token_accuracy": 0.775927260518074, + "num_tokens": 653169014.0, + "step": 2271 + }, + { + "epoch": 0.8092609082813892, + "grad_norm": 0.6601242423057556, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7677312940359116, + "num_tokens": 653491576.0, + "step": 2272 + }, + { + "epoch": 0.8096170970614426, + "grad_norm": 0.6777330636978149, + "learning_rate": 1e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7680913656949997, + "num_tokens": 653776806.0, + "step": 2273 + }, + { + "epoch": 0.809973285841496, + "grad_norm": 0.6846239566802979, + "learning_rate": 1e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7574449926614761, + "num_tokens": 654047281.0, + "step": 2274 + }, + { + "epoch": 0.8103294746215495, + "grad_norm": 0.6671757102012634, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7589717656373978, + "num_tokens": 654326946.0, + "step": 2275 + }, + { + "epoch": 0.8106856634016029, + "grad_norm": 0.6579907536506653, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7358271777629852, + "num_tokens": 654623088.0, + "step": 2276 + }, + { + "epoch": 0.8110418521816563, + "grad_norm": 0.6782415509223938, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7536306232213974, + "num_tokens": 654921912.0, + "step": 2277 + }, + { + "epoch": 0.8113980409617098, + "grad_norm": 0.6437388062477112, + "learning_rate": 1e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7727184593677521, + "num_tokens": 655199745.0, + "step": 2278 + }, + { + "epoch": 0.8117542297417631, + "grad_norm": 0.675218939781189, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7655206918716431, + "num_tokens": 655493440.0, + "step": 2279 + }, + { + "epoch": 0.8121104185218165, + "grad_norm": 0.6486790180206299, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7634501308202744, + "num_tokens": 655807552.0, + "step": 2280 + }, + { + "epoch": 0.81246660730187, + 
"grad_norm": 0.6578036546707153, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7586449235677719, + "num_tokens": 656107869.0, + "step": 2281 + }, + { + "epoch": 0.8128227960819234, + "grad_norm": 0.6646526455879211, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7550256699323654, + "num_tokens": 656387427.0, + "step": 2282 + }, + { + "epoch": 0.8131789848619768, + "grad_norm": 0.6942019462585449, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.75694639980793, + "num_tokens": 656642150.0, + "step": 2283 + }, + { + "epoch": 0.8135351736420303, + "grad_norm": 0.6780902743339539, + "learning_rate": 1e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7743101418018341, + "num_tokens": 656915753.0, + "step": 2284 + }, + { + "epoch": 0.8138913624220837, + "grad_norm": 0.7178621888160706, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7431769669055939, + "num_tokens": 657175598.0, + "step": 2285 + }, + { + "epoch": 0.8142475512021371, + "grad_norm": 0.701992928981781, + "learning_rate": 1e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7574764788150787, + "num_tokens": 657476507.0, + "step": 2286 + }, + { + "epoch": 0.8146037399821906, + "grad_norm": 0.6667888760566711, + "learning_rate": 1e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.7757705301046371, + "num_tokens": 657773704.0, + "step": 2287 + }, + { + "epoch": 0.814959928762244, + "grad_norm": 0.6448433995246887, + "learning_rate": 1e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.778081864118576, + "num_tokens": 658081832.0, + "step": 2288 + }, + { + "epoch": 0.8153161175422974, + "grad_norm": 0.7291236519813538, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7503075748682022, + "num_tokens": 658352899.0, + "step": 2289 + }, + { + "epoch": 0.8156723063223509, + "grad_norm": 0.7080919146537781, + "learning_rate": 1e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.779916524887085, + 
"num_tokens": 658625178.0, + "step": 2290 + }, + { + "epoch": 0.8160284951024043, + "grad_norm": 0.6684027314186096, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7626764625310898, + "num_tokens": 658931863.0, + "step": 2291 + }, + { + "epoch": 0.8163846838824577, + "grad_norm": 0.657751202583313, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7676074802875519, + "num_tokens": 659227638.0, + "step": 2292 + }, + { + "epoch": 0.8167408726625112, + "grad_norm": 0.6540340781211853, + "learning_rate": 1e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.7853255718946457, + "num_tokens": 659515580.0, + "step": 2293 + }, + { + "epoch": 0.8170970614425646, + "grad_norm": 0.6894474029541016, + "learning_rate": 1e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7802927494049072, + "num_tokens": 659805635.0, + "step": 2294 + }, + { + "epoch": 0.8174532502226179, + "grad_norm": 0.6619656682014465, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7560590654611588, + "num_tokens": 660110950.0, + "step": 2295 + }, + { + "epoch": 0.8178094390026714, + "grad_norm": 0.6761106848716736, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7603833079338074, + "num_tokens": 660399345.0, + "step": 2296 + }, + { + "epoch": 0.8181656277827248, + "grad_norm": 0.6567825078964233, + "learning_rate": 1e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.765841156244278, + "num_tokens": 660693544.0, + "step": 2297 + }, + { + "epoch": 0.8185218165627782, + "grad_norm": 0.6893510222434998, + "learning_rate": 1e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7413447797298431, + "num_tokens": 660971249.0, + "step": 2298 + }, + { + "epoch": 0.8188780053428317, + "grad_norm": 0.6889421939849854, + "learning_rate": 1e-06, + "loss": 0.6438, + "mean_token_accuracy": 0.7914816588163376, + "num_tokens": 661235818.0, + "step": 2299 + }, + { + "epoch": 0.8192341941228851, + "grad_norm": 0.6504808664321899, + 
"learning_rate": 1e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7736584097146988, + "num_tokens": 661529386.0, + "step": 2300 + }, + { + "epoch": 0.8195903829029385, + "grad_norm": 0.680389404296875, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7529180347919464, + "num_tokens": 661819544.0, + "step": 2301 + }, + { + "epoch": 0.819946571682992, + "grad_norm": 0.6856846809387207, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7618640810251236, + "num_tokens": 662100254.0, + "step": 2302 + }, + { + "epoch": 0.8203027604630454, + "grad_norm": 0.7136067152023315, + "learning_rate": 1e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7607178092002869, + "num_tokens": 662373943.0, + "step": 2303 + }, + { + "epoch": 0.8206589492430988, + "grad_norm": 0.6659607887268066, + "learning_rate": 1e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.773666724562645, + "num_tokens": 662665502.0, + "step": 2304 + }, + { + "epoch": 0.8210151380231523, + "grad_norm": 0.6674306392669678, + "learning_rate": 1e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7642912417650223, + "num_tokens": 662961833.0, + "step": 2305 + }, + { + "epoch": 0.8213713268032057, + "grad_norm": 0.707481324672699, + "learning_rate": 1e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.753974050283432, + "num_tokens": 663240466.0, + "step": 2306 + }, + { + "epoch": 0.8217275155832592, + "grad_norm": 0.6828428506851196, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.757883295416832, + "num_tokens": 663498907.0, + "step": 2307 + }, + { + "epoch": 0.8220837043633126, + "grad_norm": 0.6980103850364685, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7501301765441895, + "num_tokens": 663772196.0, + "step": 2308 + }, + { + "epoch": 0.822439893143366, + "grad_norm": 0.7218775749206543, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.749781146645546, + "num_tokens": 664072498.0, + "step": 2309 + }, 
+ { + "epoch": 0.8227960819234195, + "grad_norm": 0.7180657386779785, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7637830823659897, + "num_tokens": 664325084.0, + "step": 2310 + }, + { + "epoch": 0.8231522707034729, + "grad_norm": 0.6464791297912598, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7550983875989914, + "num_tokens": 664642251.0, + "step": 2311 + }, + { + "epoch": 0.8235084594835262, + "grad_norm": 0.6757043600082397, + "learning_rate": 1e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7721974551677704, + "num_tokens": 664951191.0, + "step": 2312 + }, + { + "epoch": 0.8238646482635797, + "grad_norm": 0.7649275064468384, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7530903667211533, + "num_tokens": 665206451.0, + "step": 2313 + }, + { + "epoch": 0.8242208370436331, + "grad_norm": 0.6332659125328064, + "learning_rate": 1e-06, + "loss": 0.697, + "mean_token_accuracy": 0.780927300453186, + "num_tokens": 665523038.0, + "step": 2314 + }, + { + "epoch": 0.8245770258236865, + "grad_norm": 0.691163957118988, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7428819686174393, + "num_tokens": 665817699.0, + "step": 2315 + }, + { + "epoch": 0.82493321460374, + "grad_norm": 0.6586810946464539, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7640466392040253, + "num_tokens": 666108383.0, + "step": 2316 + }, + { + "epoch": 0.8252894033837934, + "grad_norm": 0.7090243697166443, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7468613237142563, + "num_tokens": 666378119.0, + "step": 2317 + }, + { + "epoch": 0.8256455921638468, + "grad_norm": 0.7190473079681396, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7661720365285873, + "num_tokens": 666661466.0, + "step": 2318 + }, + { + "epoch": 0.8260017809439003, + "grad_norm": 0.6994612216949463, + "learning_rate": 1e-06, + "loss": 0.7179, + 
"mean_token_accuracy": 0.7745846509933472, + "num_tokens": 666938018.0, + "step": 2319 + }, + { + "epoch": 0.8263579697239537, + "grad_norm": 0.6558566093444824, + "learning_rate": 1e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.7676221430301666, + "num_tokens": 667250098.0, + "step": 2320 + }, + { + "epoch": 0.8267141585040071, + "grad_norm": 0.673427402973175, + "learning_rate": 1e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7645684033632278, + "num_tokens": 667516875.0, + "step": 2321 + }, + { + "epoch": 0.8270703472840606, + "grad_norm": 0.6758596897125244, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7532147616147995, + "num_tokens": 667805785.0, + "step": 2322 + }, + { + "epoch": 0.827426536064114, + "grad_norm": 0.7380167245864868, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7676801234483719, + "num_tokens": 668064776.0, + "step": 2323 + }, + { + "epoch": 0.8277827248441674, + "grad_norm": 0.6771019101142883, + "learning_rate": 1e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.7522769868373871, + "num_tokens": 668368661.0, + "step": 2324 + }, + { + "epoch": 0.8281389136242209, + "grad_norm": 0.6817558407783508, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7593671381473541, + "num_tokens": 668673454.0, + "step": 2325 + }, + { + "epoch": 0.8284951024042743, + "grad_norm": 0.6356712579727173, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7598068863153458, + "num_tokens": 668978489.0, + "step": 2326 + }, + { + "epoch": 0.8288512911843277, + "grad_norm": 0.6698744297027588, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7700797915458679, + "num_tokens": 669250022.0, + "step": 2327 + }, + { + "epoch": 0.8292074799643812, + "grad_norm": 0.6487880349159241, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7478306293487549, + "num_tokens": 669544707.0, + "step": 2328 + }, + { + "epoch": 0.8295636687444345, + 
"grad_norm": 0.7294020056724548, + "learning_rate": 1e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7659475654363632, + "num_tokens": 669809823.0, + "step": 2329 + }, + { + "epoch": 0.8299198575244879, + "grad_norm": 0.6735602617263794, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7601701319217682, + "num_tokens": 670090439.0, + "step": 2330 + }, + { + "epoch": 0.8302760463045414, + "grad_norm": 0.6365821957588196, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.753448098897934, + "num_tokens": 670405138.0, + "step": 2331 + }, + { + "epoch": 0.8306322350845948, + "grad_norm": 0.700553297996521, + "learning_rate": 1e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7524193972349167, + "num_tokens": 670682780.0, + "step": 2332 + }, + { + "epoch": 0.8309884238646482, + "grad_norm": 0.6535807251930237, + "learning_rate": 1e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7509471923112869, + "num_tokens": 671008179.0, + "step": 2333 + }, + { + "epoch": 0.8313446126447017, + "grad_norm": 0.667327344417572, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7666208446025848, + "num_tokens": 671301286.0, + "step": 2334 + }, + { + "epoch": 0.8317008014247551, + "grad_norm": 0.6961522102355957, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.7542032897472382, + "num_tokens": 671591454.0, + "step": 2335 + }, + { + "epoch": 0.8320569902048085, + "grad_norm": 0.677208423614502, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7542543262243271, + "num_tokens": 671881819.0, + "step": 2336 + }, + { + "epoch": 0.832413178984862, + "grad_norm": 0.6834362745285034, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7684956789016724, + "num_tokens": 672188380.0, + "step": 2337 + }, + { + "epoch": 0.8327693677649154, + "grad_norm": 0.6773068308830261, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7570211440324783, + 
"num_tokens": 672466540.0, + "step": 2338 + }, + { + "epoch": 0.8331255565449688, + "grad_norm": 0.7335455417633057, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7747850567102432, + "num_tokens": 672730114.0, + "step": 2339 + }, + { + "epoch": 0.8334817453250223, + "grad_norm": 0.705029308795929, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7492866218090057, + "num_tokens": 672989689.0, + "step": 2340 + }, + { + "epoch": 0.8338379341050757, + "grad_norm": 0.7298811078071594, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7494656890630722, + "num_tokens": 673274451.0, + "step": 2341 + }, + { + "epoch": 0.8341941228851291, + "grad_norm": 0.7522141337394714, + "learning_rate": 1e-06, + "loss": 0.72, + "mean_token_accuracy": 0.770646333694458, + "num_tokens": 673542262.0, + "step": 2342 + }, + { + "epoch": 0.8345503116651826, + "grad_norm": 0.6631610989570618, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7670901864767075, + "num_tokens": 673877998.0, + "step": 2343 + }, + { + "epoch": 0.834906500445236, + "grad_norm": 0.6542699337005615, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7490192651748657, + "num_tokens": 674187044.0, + "step": 2344 + }, + { + "epoch": 0.8352626892252895, + "grad_norm": 0.6507307291030884, + "learning_rate": 1e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7770354896783829, + "num_tokens": 674486543.0, + "step": 2345 + }, + { + "epoch": 0.8356188780053428, + "grad_norm": 0.6914659738540649, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7626991420984268, + "num_tokens": 674789365.0, + "step": 2346 + }, + { + "epoch": 0.8359750667853962, + "grad_norm": 0.6904114484786987, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.745251476764679, + "num_tokens": 675059314.0, + "step": 2347 + }, + { + "epoch": 0.8363312555654497, + "grad_norm": 0.6988269686698914, + "learning_rate": 
1e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7745276838541031, + "num_tokens": 675322854.0, + "step": 2348 + }, + { + "epoch": 0.8366874443455031, + "grad_norm": 0.6689270734786987, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7708797752857208, + "num_tokens": 675608210.0, + "step": 2349 + }, + { + "epoch": 0.8370436331255565, + "grad_norm": 0.6902801990509033, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.756140872836113, + "num_tokens": 675875192.0, + "step": 2350 + }, + { + "epoch": 0.83739982190561, + "grad_norm": 0.7109673023223877, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7454496473073959, + "num_tokens": 676146116.0, + "step": 2351 + }, + { + "epoch": 0.8377560106856634, + "grad_norm": 0.657339870929718, + "learning_rate": 1e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7419613599777222, + "num_tokens": 676457819.0, + "step": 2352 + }, + { + "epoch": 0.8381121994657168, + "grad_norm": 0.7173905372619629, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7621938139200211, + "num_tokens": 676736154.0, + "step": 2353 + }, + { + "epoch": 0.8384683882457703, + "grad_norm": 0.6297332644462585, + "learning_rate": 1e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7880305200815201, + "num_tokens": 677077098.0, + "step": 2354 + }, + { + "epoch": 0.8388245770258237, + "grad_norm": 0.6879311203956604, + "learning_rate": 1e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7642193138599396, + "num_tokens": 677342091.0, + "step": 2355 + }, + { + "epoch": 0.8391807658058771, + "grad_norm": 0.6751154661178589, + "learning_rate": 1e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7754417061805725, + "num_tokens": 677622063.0, + "step": 2356 + }, + { + "epoch": 0.8395369545859306, + "grad_norm": 0.6575679779052734, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7509424388408661, + "num_tokens": 677921651.0, + "step": 2357 + }, + { + 
"epoch": 0.839893143365984, + "grad_norm": 0.7044042944908142, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7528955638408661, + "num_tokens": 678191613.0, + "step": 2358 + }, + { + "epoch": 0.8402493321460374, + "grad_norm": 0.6888781189918518, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7691991925239563, + "num_tokens": 678495247.0, + "step": 2359 + }, + { + "epoch": 0.8406055209260909, + "grad_norm": 0.6251295208930969, + "learning_rate": 1e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7711558640003204, + "num_tokens": 678806848.0, + "step": 2360 + }, + { + "epoch": 0.8409617097061443, + "grad_norm": 0.6809543371200562, + "learning_rate": 1e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7632822394371033, + "num_tokens": 679089321.0, + "step": 2361 + }, + { + "epoch": 0.8413178984861976, + "grad_norm": 0.6830254793167114, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7556702494621277, + "num_tokens": 679359495.0, + "step": 2362 + }, + { + "epoch": 0.8416740872662511, + "grad_norm": 0.6303315162658691, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7562137097120285, + "num_tokens": 679697148.0, + "step": 2363 + }, + { + "epoch": 0.8420302760463045, + "grad_norm": 0.6976258158683777, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7440847605466843, + "num_tokens": 679962536.0, + "step": 2364 + }, + { + "epoch": 0.8423864648263579, + "grad_norm": 0.6440044045448303, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7417994141578674, + "num_tokens": 680281105.0, + "step": 2365 + }, + { + "epoch": 0.8427426536064114, + "grad_norm": 0.6984385251998901, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7626495957374573, + "num_tokens": 680557733.0, + "step": 2366 + }, + { + "epoch": 0.8430988423864648, + "grad_norm": 0.6164286136627197, + "learning_rate": 1e-06, + "loss": 0.748, + 
"mean_token_accuracy": 0.7630018591880798, + "num_tokens": 680921126.0, + "step": 2367 + }, + { + "epoch": 0.8434550311665182, + "grad_norm": 0.6646066308021545, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7585286945104599, + "num_tokens": 681204866.0, + "step": 2368 + }, + { + "epoch": 0.8438112199465717, + "grad_norm": 0.7102413177490234, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7661050111055374, + "num_tokens": 681460987.0, + "step": 2369 + }, + { + "epoch": 0.8441674087266251, + "grad_norm": 0.6674376726150513, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7673872411251068, + "num_tokens": 681751502.0, + "step": 2370 + }, + { + "epoch": 0.8445235975066785, + "grad_norm": 0.663684070110321, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.739754319190979, + "num_tokens": 682049408.0, + "step": 2371 + }, + { + "epoch": 0.844879786286732, + "grad_norm": 0.6926496624946594, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7607071250677109, + "num_tokens": 682323554.0, + "step": 2372 + }, + { + "epoch": 0.8452359750667854, + "grad_norm": 0.6670950055122375, + "learning_rate": 1e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7576422840356827, + "num_tokens": 682637278.0, + "step": 2373 + }, + { + "epoch": 0.8455921638468388, + "grad_norm": 0.6583961248397827, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7532201409339905, + "num_tokens": 682937565.0, + "step": 2374 + }, + { + "epoch": 0.8459483526268923, + "grad_norm": 0.6615502834320068, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7591888457536697, + "num_tokens": 683237492.0, + "step": 2375 + }, + { + "epoch": 0.8463045414069457, + "grad_norm": 0.6829603910446167, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7689686268568039, + "num_tokens": 683503827.0, + "step": 2376 + }, + { + "epoch": 0.846660730186999, + 
"grad_norm": 0.6842308044433594, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7700541317462921, + "num_tokens": 683795395.0, + "step": 2377 + }, + { + "epoch": 0.8470169189670526, + "grad_norm": 0.6811245679855347, + "learning_rate": 1e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7812516093254089, + "num_tokens": 684094330.0, + "step": 2378 + }, + { + "epoch": 0.847373107747106, + "grad_norm": 0.6819302439689636, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7724129408597946, + "num_tokens": 684382017.0, + "step": 2379 + }, + { + "epoch": 0.8477292965271594, + "grad_norm": 0.691041886806488, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7707984447479248, + "num_tokens": 684657201.0, + "step": 2380 + }, + { + "epoch": 0.8480854853072128, + "grad_norm": 0.7043678164482117, + "learning_rate": 1e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7789788097143173, + "num_tokens": 684911711.0, + "step": 2381 + }, + { + "epoch": 0.8484416740872662, + "grad_norm": 0.6643552780151367, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7686491012573242, + "num_tokens": 685200785.0, + "step": 2382 + }, + { + "epoch": 0.8487978628673197, + "grad_norm": 0.7135539054870605, + "learning_rate": 1e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7529379725456238, + "num_tokens": 685460491.0, + "step": 2383 + }, + { + "epoch": 0.8491540516473731, + "grad_norm": 0.6739833950996399, + "learning_rate": 1e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7731255292892456, + "num_tokens": 685754056.0, + "step": 2384 + }, + { + "epoch": 0.8495102404274265, + "grad_norm": 0.6705874800682068, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7503114342689514, + "num_tokens": 686071498.0, + "step": 2385 + }, + { + "epoch": 0.84986642920748, + "grad_norm": 0.7034271359443665, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7667562812566757, + 
"num_tokens": 686338728.0, + "step": 2386 + }, + { + "epoch": 0.8502226179875334, + "grad_norm": 0.6374067664146423, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.759790763258934, + "num_tokens": 686651298.0, + "step": 2387 + }, + { + "epoch": 0.8505788067675868, + "grad_norm": 0.6674380898475647, + "learning_rate": 1e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7601442486047745, + "num_tokens": 686945955.0, + "step": 2388 + }, + { + "epoch": 0.8509349955476403, + "grad_norm": 0.6634024381637573, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.768571674823761, + "num_tokens": 687249457.0, + "step": 2389 + }, + { + "epoch": 0.8512911843276937, + "grad_norm": 0.6249246597290039, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7538134753704071, + "num_tokens": 687574058.0, + "step": 2390 + }, + { + "epoch": 0.8516473731077471, + "grad_norm": 0.6733210682868958, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7613036930561066, + "num_tokens": 687866776.0, + "step": 2391 + }, + { + "epoch": 0.8520035618878006, + "grad_norm": 0.6487922668457031, + "learning_rate": 1e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7490956485271454, + "num_tokens": 688167431.0, + "step": 2392 + }, + { + "epoch": 0.852359750667854, + "grad_norm": 0.6541228890419006, + "learning_rate": 1e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7581845223903656, + "num_tokens": 688485842.0, + "step": 2393 + }, + { + "epoch": 0.8527159394479074, + "grad_norm": 0.6781651377677917, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7750372141599655, + "num_tokens": 688774929.0, + "step": 2394 + }, + { + "epoch": 0.8530721282279609, + "grad_norm": 0.6651432514190674, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7580095529556274, + "num_tokens": 689086290.0, + "step": 2395 + }, + { + "epoch": 0.8534283170080142, + "grad_norm": 0.6393665671348572, + 
"learning_rate": 1e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7715661376714706, + "num_tokens": 689407791.0, + "step": 2396 + }, + { + "epoch": 0.8537845057880676, + "grad_norm": 0.7054398059844971, + "learning_rate": 1e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.778836652636528, + "num_tokens": 689686262.0, + "step": 2397 + }, + { + "epoch": 0.8541406945681211, + "grad_norm": 0.6497447490692139, + "learning_rate": 1e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7693967372179031, + "num_tokens": 690000484.0, + "step": 2398 + }, + { + "epoch": 0.8544968833481745, + "grad_norm": 0.7201833724975586, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7652914524078369, + "num_tokens": 690249925.0, + "step": 2399 + }, + { + "epoch": 0.8548530721282279, + "grad_norm": 0.7467626333236694, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7431979477405548, + "num_tokens": 690480719.0, + "step": 2400 + }, + { + "epoch": 0.8552092609082814, + "grad_norm": 0.7026424407958984, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7629986703395844, + "num_tokens": 690770955.0, + "step": 2401 + }, + { + "epoch": 0.8555654496883348, + "grad_norm": 0.7547562718391418, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7586901634931564, + "num_tokens": 691044060.0, + "step": 2402 + }, + { + "epoch": 0.8559216384683882, + "grad_norm": 0.6364126801490784, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7616651803255081, + "num_tokens": 691376359.0, + "step": 2403 + }, + { + "epoch": 0.8562778272484417, + "grad_norm": 0.6489138007164001, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7599796205759048, + "num_tokens": 691674969.0, + "step": 2404 + }, + { + "epoch": 0.8566340160284951, + "grad_norm": 0.6512348651885986, + "learning_rate": 1e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7746245563030243, + "num_tokens": 691979779.0, + "step": 
2405 + }, + { + "epoch": 0.8569902048085485, + "grad_norm": 0.6792957782745361, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7568101733922958, + "num_tokens": 692264527.0, + "step": 2406 + }, + { + "epoch": 0.857346393588602, + "grad_norm": 0.6824513077735901, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7614880055189133, + "num_tokens": 692553621.0, + "step": 2407 + }, + { + "epoch": 0.8577025823686554, + "grad_norm": 0.6521750688552856, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7657774239778519, + "num_tokens": 692852770.0, + "step": 2408 + }, + { + "epoch": 0.8580587711487088, + "grad_norm": 0.6895369291305542, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7740745097398758, + "num_tokens": 693125245.0, + "step": 2409 + }, + { + "epoch": 0.8584149599287623, + "grad_norm": 0.680965781211853, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7671957165002823, + "num_tokens": 693430175.0, + "step": 2410 + }, + { + "epoch": 0.8587711487088157, + "grad_norm": 0.6946503520011902, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7659398317337036, + "num_tokens": 693736886.0, + "step": 2411 + }, + { + "epoch": 0.859127337488869, + "grad_norm": 0.7245498299598694, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7490795999765396, + "num_tokens": 694003602.0, + "step": 2412 + }, + { + "epoch": 0.8594835262689225, + "grad_norm": 0.6300719380378723, + "learning_rate": 1e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7582734674215317, + "num_tokens": 694336150.0, + "step": 2413 + }, + { + "epoch": 0.8598397150489759, + "grad_norm": 0.7244418263435364, + "learning_rate": 1e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7787848562002182, + "num_tokens": 694587872.0, + "step": 2414 + }, + { + "epoch": 0.8601959038290294, + "grad_norm": 0.7955553531646729, + "learning_rate": 1e-06, + "loss": 0.7772, + 
"mean_token_accuracy": 0.7554849833250046, + "num_tokens": 694849293.0, + "step": 2415 + }, + { + "epoch": 0.8605520926090828, + "grad_norm": 0.6736185550689697, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7685449868440628, + "num_tokens": 695139712.0, + "step": 2416 + }, + { + "epoch": 0.8609082813891362, + "grad_norm": 0.6489673256874084, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7620951384305954, + "num_tokens": 695447729.0, + "step": 2417 + }, + { + "epoch": 0.8612644701691897, + "grad_norm": 0.7167323231697083, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.755193442106247, + "num_tokens": 695721525.0, + "step": 2418 + }, + { + "epoch": 0.8616206589492431, + "grad_norm": 0.7641589641571045, + "learning_rate": 1e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7681029289960861, + "num_tokens": 695993088.0, + "step": 2419 + }, + { + "epoch": 0.8619768477292965, + "grad_norm": 0.6638623476028442, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7698719352483749, + "num_tokens": 696314347.0, + "step": 2420 + }, + { + "epoch": 0.86233303650935, + "grad_norm": 0.6775693893432617, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.737791582942009, + "num_tokens": 696625164.0, + "step": 2421 + }, + { + "epoch": 0.8626892252894034, + "grad_norm": 0.735906183719635, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7574936151504517, + "num_tokens": 696862425.0, + "step": 2422 + }, + { + "epoch": 0.8630454140694568, + "grad_norm": 0.7426533102989197, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7575177401304245, + "num_tokens": 697150941.0, + "step": 2423 + }, + { + "epoch": 0.8634016028495103, + "grad_norm": 0.7182178497314453, + "learning_rate": 1e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7619048953056335, + "num_tokens": 697423730.0, + "step": 2424 + }, + { + "epoch": 0.8637577916295637, + 
"grad_norm": 0.6491935849189758, + "learning_rate": 1e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7752884030342102, + "num_tokens": 697730066.0, + "step": 2425 + }, + { + "epoch": 0.8641139804096171, + "grad_norm": 0.6800692677497864, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7557413429021835, + "num_tokens": 698026056.0, + "step": 2426 + }, + { + "epoch": 0.8644701691896706, + "grad_norm": 0.6753207445144653, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7679847776889801, + "num_tokens": 698333517.0, + "step": 2427 + }, + { + "epoch": 0.864826357969724, + "grad_norm": 0.6948007941246033, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.737611249089241, + "num_tokens": 698625487.0, + "step": 2428 + }, + { + "epoch": 0.8651825467497773, + "grad_norm": 0.7139393091201782, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7645202130079269, + "num_tokens": 698889336.0, + "step": 2429 + }, + { + "epoch": 0.8655387355298308, + "grad_norm": 0.6308135986328125, + "learning_rate": 1e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7587099820375443, + "num_tokens": 699212234.0, + "step": 2430 + }, + { + "epoch": 0.8658949243098842, + "grad_norm": 0.660066545009613, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7629633098840714, + "num_tokens": 699516775.0, + "step": 2431 + }, + { + "epoch": 0.8662511130899376, + "grad_norm": 0.6633830666542053, + "learning_rate": 1e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7606598883867264, + "num_tokens": 699821944.0, + "step": 2432 + }, + { + "epoch": 0.8666073018699911, + "grad_norm": 0.7218127846717834, + "learning_rate": 1e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7782433778047562, + "num_tokens": 700071904.0, + "step": 2433 + }, + { + "epoch": 0.8669634906500445, + "grad_norm": 0.7150934338569641, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7549867630004883, + 
"num_tokens": 700351731.0, + "step": 2434 + }, + { + "epoch": 0.8673196794300979, + "grad_norm": 0.6182823181152344, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7795597910881042, + "num_tokens": 700665700.0, + "step": 2435 + }, + { + "epoch": 0.8676758682101514, + "grad_norm": 0.6701721549034119, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7519806176424026, + "num_tokens": 700969066.0, + "step": 2436 + }, + { + "epoch": 0.8680320569902048, + "grad_norm": 0.6723957657814026, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7568967789411545, + "num_tokens": 701255568.0, + "step": 2437 + }, + { + "epoch": 0.8683882457702582, + "grad_norm": 0.6549030542373657, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7642615735530853, + "num_tokens": 701551312.0, + "step": 2438 + }, + { + "epoch": 0.8687444345503117, + "grad_norm": 0.7441964745521545, + "learning_rate": 1e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.761664405465126, + "num_tokens": 701827879.0, + "step": 2439 + }, + { + "epoch": 0.8691006233303651, + "grad_norm": 0.6622391939163208, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7657350450754166, + "num_tokens": 702157308.0, + "step": 2440 + }, + { + "epoch": 0.8694568121104185, + "grad_norm": 0.7129927277565002, + "learning_rate": 1e-06, + "loss": 0.7631, + "mean_token_accuracy": 0.7596273869276047, + "num_tokens": 702426315.0, + "step": 2441 + }, + { + "epoch": 0.869813000890472, + "grad_norm": 0.6571009755134583, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.773458868265152, + "num_tokens": 702713207.0, + "step": 2442 + }, + { + "epoch": 0.8701691896705254, + "grad_norm": 0.7124523520469666, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7513592839241028, + "num_tokens": 702988756.0, + "step": 2443 + }, + { + "epoch": 0.8705253784505788, + "grad_norm": 0.6641309857368469, + 
"learning_rate": 1e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7555475533008575, + "num_tokens": 703289964.0, + "step": 2444 + }, + { + "epoch": 0.8708815672306323, + "grad_norm": 0.6760680675506592, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.747878223657608, + "num_tokens": 703585014.0, + "step": 2445 + }, + { + "epoch": 0.8712377560106856, + "grad_norm": 0.6965348720550537, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7434747964143753, + "num_tokens": 703864882.0, + "step": 2446 + }, + { + "epoch": 0.871593944790739, + "grad_norm": 0.7085098028182983, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.756559818983078, + "num_tokens": 704149153.0, + "step": 2447 + }, + { + "epoch": 0.8719501335707925, + "grad_norm": 0.7288050055503845, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.7540728449821472, + "num_tokens": 704397083.0, + "step": 2448 + }, + { + "epoch": 0.8723063223508459, + "grad_norm": 0.692121148109436, + "learning_rate": 1e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7567595094442368, + "num_tokens": 704667810.0, + "step": 2449 + }, + { + "epoch": 0.8726625111308994, + "grad_norm": 0.6481083631515503, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7497033327817917, + "num_tokens": 705001200.0, + "step": 2450 + }, + { + "epoch": 0.8730186999109528, + "grad_norm": 0.6447480916976929, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7637882679700851, + "num_tokens": 705304486.0, + "step": 2451 + }, + { + "epoch": 0.8733748886910062, + "grad_norm": 0.692496657371521, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7599953413009644, + "num_tokens": 705587586.0, + "step": 2452 + }, + { + "epoch": 0.8737310774710597, + "grad_norm": 0.684700071811676, + "learning_rate": 1e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7667763084173203, + "num_tokens": 705893855.0, + "step": 2453 + 
}, + { + "epoch": 0.8740872662511131, + "grad_norm": 0.6479513049125671, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7526340037584305, + "num_tokens": 706202487.0, + "step": 2454 + }, + { + "epoch": 0.8744434550311665, + "grad_norm": 0.6629588603973389, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7570133805274963, + "num_tokens": 706539002.0, + "step": 2455 + }, + { + "epoch": 0.87479964381122, + "grad_norm": 0.7791407108306885, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7419565618038177, + "num_tokens": 706795718.0, + "step": 2456 + }, + { + "epoch": 0.8751558325912734, + "grad_norm": 0.6587259769439697, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7640388160943985, + "num_tokens": 707087897.0, + "step": 2457 + }, + { + "epoch": 0.8755120213713268, + "grad_norm": 0.668979287147522, + "learning_rate": 1e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7565833181142807, + "num_tokens": 707345866.0, + "step": 2458 + }, + { + "epoch": 0.8758682101513803, + "grad_norm": 0.7196013927459717, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7664308995008469, + "num_tokens": 707611223.0, + "step": 2459 + }, + { + "epoch": 0.8762243989314337, + "grad_norm": 0.6964629888534546, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.762649729847908, + "num_tokens": 707888328.0, + "step": 2460 + }, + { + "epoch": 0.8765805877114871, + "grad_norm": 0.6801053285598755, + "learning_rate": 1e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.7840108275413513, + "num_tokens": 708166858.0, + "step": 2461 + }, + { + "epoch": 0.8769367764915406, + "grad_norm": 0.6950015425682068, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7618553340435028, + "num_tokens": 708459786.0, + "step": 2462 + }, + { + "epoch": 0.877292965271594, + "grad_norm": 0.6847928762435913, + "learning_rate": 1e-06, + "loss": 0.7745, + 
"mean_token_accuracy": 0.7620252370834351, + "num_tokens": 708742175.0, + "step": 2463 + }, + { + "epoch": 0.8776491540516473, + "grad_norm": 0.7382085919380188, + "learning_rate": 1e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7455883324146271, + "num_tokens": 709009163.0, + "step": 2464 + }, + { + "epoch": 0.8780053428317008, + "grad_norm": 0.8057679533958435, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.771762028336525, + "num_tokens": 709285119.0, + "step": 2465 + }, + { + "epoch": 0.8783615316117542, + "grad_norm": 0.6454748511314392, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7527047842741013, + "num_tokens": 709607280.0, + "step": 2466 + }, + { + "epoch": 0.8787177203918076, + "grad_norm": 0.7181129455566406, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7493198364973068, + "num_tokens": 709882144.0, + "step": 2467 + }, + { + "epoch": 0.8790739091718611, + "grad_norm": 0.7462397217750549, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7537310719490051, + "num_tokens": 710146885.0, + "step": 2468 + }, + { + "epoch": 0.8794300979519145, + "grad_norm": 0.7311971783638, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7463038116693497, + "num_tokens": 710406227.0, + "step": 2469 + }, + { + "epoch": 0.8797862867319679, + "grad_norm": 0.7098777294158936, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7716893255710602, + "num_tokens": 710670804.0, + "step": 2470 + }, + { + "epoch": 0.8801424755120214, + "grad_norm": 0.7306983470916748, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7542220205068588, + "num_tokens": 710937056.0, + "step": 2471 + }, + { + "epoch": 0.8804986642920748, + "grad_norm": 0.6974586248397827, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.736962080001831, + "num_tokens": 711235194.0, + "step": 2472 + }, + { + "epoch": 0.8808548530721282, + 
"grad_norm": 0.6774189472198486, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7619919329881668, + "num_tokens": 711523203.0, + "step": 2473 + }, + { + "epoch": 0.8812110418521817, + "grad_norm": 0.75505530834198, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7553759962320328, + "num_tokens": 711801429.0, + "step": 2474 + }, + { + "epoch": 0.8815672306322351, + "grad_norm": 0.6741814613342285, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7382449805736542, + "num_tokens": 712094540.0, + "step": 2475 + }, + { + "epoch": 0.8819234194122885, + "grad_norm": 0.672813355922699, + "learning_rate": 1e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7782243639230728, + "num_tokens": 712359006.0, + "step": 2476 + }, + { + "epoch": 0.882279608192342, + "grad_norm": 0.7520366907119751, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7563323527574539, + "num_tokens": 712608675.0, + "step": 2477 + }, + { + "epoch": 0.8826357969723954, + "grad_norm": 0.7235170602798462, + "learning_rate": 1e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.761713832616806, + "num_tokens": 712891800.0, + "step": 2478 + }, + { + "epoch": 0.8829919857524487, + "grad_norm": 0.6766006946563721, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7421709150075912, + "num_tokens": 713197099.0, + "step": 2479 + }, + { + "epoch": 0.8833481745325023, + "grad_norm": 0.676579475402832, + "learning_rate": 1e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.7723901271820068, + "num_tokens": 713471257.0, + "step": 2480 + }, + { + "epoch": 0.8837043633125556, + "grad_norm": 0.6912738680839539, + "learning_rate": 1e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7617654800415039, + "num_tokens": 713756822.0, + "step": 2481 + }, + { + "epoch": 0.884060552092609, + "grad_norm": 0.6819356083869934, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7431947141885757, + 
"num_tokens": 714068238.0, + "step": 2482 + }, + { + "epoch": 0.8844167408726625, + "grad_norm": 0.6692376136779785, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7550634145736694, + "num_tokens": 714379963.0, + "step": 2483 + }, + { + "epoch": 0.8847729296527159, + "grad_norm": 0.7308802008628845, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7472765147686005, + "num_tokens": 714654153.0, + "step": 2484 + }, + { + "epoch": 0.8851291184327693, + "grad_norm": 0.7156696915626526, + "learning_rate": 1e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.773243859410286, + "num_tokens": 714945792.0, + "step": 2485 + }, + { + "epoch": 0.8854853072128228, + "grad_norm": 0.7005816698074341, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7524689733982086, + "num_tokens": 715217991.0, + "step": 2486 + }, + { + "epoch": 0.8858414959928762, + "grad_norm": 0.6588144302368164, + "learning_rate": 1e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7619762420654297, + "num_tokens": 715532247.0, + "step": 2487 + }, + { + "epoch": 0.8861976847729297, + "grad_norm": 0.6994578838348389, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7644343078136444, + "num_tokens": 715795702.0, + "step": 2488 + }, + { + "epoch": 0.8865538735529831, + "grad_norm": 0.691099226474762, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.769805982708931, + "num_tokens": 716071613.0, + "step": 2489 + }, + { + "epoch": 0.8869100623330365, + "grad_norm": 0.7195239067077637, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7599294632673264, + "num_tokens": 716322482.0, + "step": 2490 + }, + { + "epoch": 0.88726625111309, + "grad_norm": 0.7116140723228455, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7760369181632996, + "num_tokens": 716604013.0, + "step": 2491 + }, + { + "epoch": 0.8876224398931434, + "grad_norm": 0.6858648657798767, + 
"learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.763879582285881, + "num_tokens": 716890175.0, + "step": 2492 + }, + { + "epoch": 0.8879786286731968, + "grad_norm": 0.6672384738922119, + "learning_rate": 1e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7687101066112518, + "num_tokens": 717175250.0, + "step": 2493 + }, + { + "epoch": 0.8883348174532503, + "grad_norm": 0.6792609095573425, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7429053634405136, + "num_tokens": 717458657.0, + "step": 2494 + }, + { + "epoch": 0.8886910062333037, + "grad_norm": 0.7256088852882385, + "learning_rate": 1e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.7834853231906891, + "num_tokens": 717738507.0, + "step": 2495 + }, + { + "epoch": 0.889047195013357, + "grad_norm": 0.7011347413063049, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7460843324661255, + "num_tokens": 717991632.0, + "step": 2496 + }, + { + "epoch": 0.8894033837934106, + "grad_norm": 0.6967030763626099, + "learning_rate": 1e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7638818472623825, + "num_tokens": 718275231.0, + "step": 2497 + }, + { + "epoch": 0.8897595725734639, + "grad_norm": 0.6824765205383301, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7693362832069397, + "num_tokens": 718577908.0, + "step": 2498 + }, + { + "epoch": 0.8901157613535173, + "grad_norm": 0.683871328830719, + "learning_rate": 1e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7804124653339386, + "num_tokens": 718871166.0, + "step": 2499 + }, + { + "epoch": 0.8904719501335708, + "grad_norm": 0.693622350692749, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7413828074932098, + "num_tokens": 719172931.0, + "step": 2500 + }, + { + "epoch": 0.8908281389136242, + "grad_norm": 0.6919347047805786, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.74769027531147, + "num_tokens": 719446372.0, + "step": 2501 + 
}, + { + "epoch": 0.8911843276936776, + "grad_norm": 0.6751816868782043, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7708669900894165, + "num_tokens": 719776927.0, + "step": 2502 + }, + { + "epoch": 0.8915405164737311, + "grad_norm": 0.6863644123077393, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.771586999297142, + "num_tokens": 720054454.0, + "step": 2503 + }, + { + "epoch": 0.8918967052537845, + "grad_norm": 0.7297400832176208, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7326669842004776, + "num_tokens": 720315006.0, + "step": 2504 + }, + { + "epoch": 0.8922528940338379, + "grad_norm": 0.6767438650131226, + "learning_rate": 1e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7630958259105682, + "num_tokens": 720584918.0, + "step": 2505 + }, + { + "epoch": 0.8926090828138914, + "grad_norm": 0.6785357594490051, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7628059983253479, + "num_tokens": 720869297.0, + "step": 2506 + }, + { + "epoch": 0.8929652715939448, + "grad_norm": 0.6992804408073425, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.754859060049057, + "num_tokens": 721153755.0, + "step": 2507 + }, + { + "epoch": 0.8933214603739982, + "grad_norm": 0.6842201352119446, + "learning_rate": 1e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.779353067278862, + "num_tokens": 721425524.0, + "step": 2508 + }, + { + "epoch": 0.8936776491540517, + "grad_norm": 0.6978436708450317, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7621790766716003, + "num_tokens": 721716088.0, + "step": 2509 + }, + { + "epoch": 0.8940338379341051, + "grad_norm": 0.6590428352355957, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7531222105026245, + "num_tokens": 722027461.0, + "step": 2510 + }, + { + "epoch": 0.8943900267141585, + "grad_norm": 0.6723711490631104, + "learning_rate": 1e-06, + "loss": 0.7251, + 
"mean_token_accuracy": 0.7713529914617538, + "num_tokens": 722301060.0, + "step": 2511 + }, + { + "epoch": 0.894746215494212, + "grad_norm": 0.6725243330001831, + "learning_rate": 1e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7844805121421814, + "num_tokens": 722596493.0, + "step": 2512 + }, + { + "epoch": 0.8951024042742654, + "grad_norm": 0.6492499113082886, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.764170303940773, + "num_tokens": 722918289.0, + "step": 2513 + }, + { + "epoch": 0.8954585930543187, + "grad_norm": 0.6853066682815552, + "learning_rate": 1e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7590610533952713, + "num_tokens": 723189138.0, + "step": 2514 + }, + { + "epoch": 0.8958147818343722, + "grad_norm": 0.6521274447441101, + "learning_rate": 1e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.7757738679647446, + "num_tokens": 723500928.0, + "step": 2515 + }, + { + "epoch": 0.8961709706144256, + "grad_norm": 0.712171196937561, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7656765282154083, + "num_tokens": 723770476.0, + "step": 2516 + }, + { + "epoch": 0.896527159394479, + "grad_norm": 0.7031210660934448, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7481237053871155, + "num_tokens": 724053896.0, + "step": 2517 + }, + { + "epoch": 0.8968833481745325, + "grad_norm": 0.6690787672996521, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7502534091472626, + "num_tokens": 724341294.0, + "step": 2518 + }, + { + "epoch": 0.8972395369545859, + "grad_norm": 0.642870306968689, + "learning_rate": 1e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7703318893909454, + "num_tokens": 724668105.0, + "step": 2519 + }, + { + "epoch": 0.8975957257346393, + "grad_norm": 0.6740546822547913, + "learning_rate": 1e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7555363774299622, + "num_tokens": 724966805.0, + "step": 2520 + }, + { + "epoch": 0.8979519145146928, + 
"grad_norm": 0.6989254355430603, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7625323981046677, + "num_tokens": 725247244.0, + "step": 2521 + }, + { + "epoch": 0.8983081032947462, + "grad_norm": 0.7043266296386719, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7474681735038757, + "num_tokens": 725550309.0, + "step": 2522 + }, + { + "epoch": 0.8986642920747997, + "grad_norm": 0.6756611466407776, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7538847774267197, + "num_tokens": 725825694.0, + "step": 2523 + }, + { + "epoch": 0.8990204808548531, + "grad_norm": 0.6892859935760498, + "learning_rate": 1e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.771716833114624, + "num_tokens": 726074610.0, + "step": 2524 + }, + { + "epoch": 0.8993766696349065, + "grad_norm": 0.6959080100059509, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7636406719684601, + "num_tokens": 726363154.0, + "step": 2525 + }, + { + "epoch": 0.89973285841496, + "grad_norm": 0.7062594294548035, + "learning_rate": 1e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7637770473957062, + "num_tokens": 726633604.0, + "step": 2526 + }, + { + "epoch": 0.9000890471950134, + "grad_norm": 0.6845508813858032, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7610510587692261, + "num_tokens": 726917826.0, + "step": 2527 + }, + { + "epoch": 0.9004452359750668, + "grad_norm": 0.6582940816879272, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.767650842666626, + "num_tokens": 727231580.0, + "step": 2528 + }, + { + "epoch": 0.9008014247551203, + "grad_norm": 0.670625627040863, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7644346505403519, + "num_tokens": 727524992.0, + "step": 2529 + }, + { + "epoch": 0.9011576135351737, + "grad_norm": 0.6449087262153625, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7555203884840012, + 
"num_tokens": 727842710.0, + "step": 2530 + }, + { + "epoch": 0.901513802315227, + "grad_norm": 0.694887101650238, + "learning_rate": 1e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.7552567273378372, + "num_tokens": 728145146.0, + "step": 2531 + }, + { + "epoch": 0.9018699910952805, + "grad_norm": 0.6894049048423767, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7397135645151138, + "num_tokens": 728417040.0, + "step": 2532 + }, + { + "epoch": 0.9022261798753339, + "grad_norm": 0.6978943347930908, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7525843530893326, + "num_tokens": 728695059.0, + "step": 2533 + }, + { + "epoch": 0.9025823686553873, + "grad_norm": 0.6367285847663879, + "learning_rate": 1e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7627434432506561, + "num_tokens": 729026190.0, + "step": 2534 + }, + { + "epoch": 0.9029385574354408, + "grad_norm": 0.6891121864318848, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7640596032142639, + "num_tokens": 729329346.0, + "step": 2535 + }, + { + "epoch": 0.9032947462154942, + "grad_norm": 0.7017749547958374, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7549757808446884, + "num_tokens": 729604798.0, + "step": 2536 + }, + { + "epoch": 0.9036509349955476, + "grad_norm": 0.7001579999923706, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7582866847515106, + "num_tokens": 729907547.0, + "step": 2537 + }, + { + "epoch": 0.9040071237756011, + "grad_norm": 0.624547004699707, + "learning_rate": 1e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7505633682012558, + "num_tokens": 730245110.0, + "step": 2538 + }, + { + "epoch": 0.9043633125556545, + "grad_norm": 0.6873339414596558, + "learning_rate": 1e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7608938664197922, + "num_tokens": 730526027.0, + "step": 2539 + }, + { + "epoch": 0.9047195013357079, + "grad_norm": 0.6599506735801697, + 
"learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7536908984184265, + "num_tokens": 730823442.0, + "step": 2540 + }, + { + "epoch": 0.9050756901157614, + "grad_norm": 0.7200509309768677, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7595397979021072, + "num_tokens": 731083463.0, + "step": 2541 + }, + { + "epoch": 0.9054318788958148, + "grad_norm": 0.6581294536590576, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7649051696062088, + "num_tokens": 731373856.0, + "step": 2542 + }, + { + "epoch": 0.9057880676758682, + "grad_norm": 0.6388940811157227, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7586410343647003, + "num_tokens": 731672059.0, + "step": 2543 + }, + { + "epoch": 0.9061442564559217, + "grad_norm": 0.7214067578315735, + "learning_rate": 1e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.740902304649353, + "num_tokens": 731939740.0, + "step": 2544 + }, + { + "epoch": 0.9065004452359751, + "grad_norm": 0.6697251796722412, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7597993016242981, + "num_tokens": 732224558.0, + "step": 2545 + }, + { + "epoch": 0.9068566340160285, + "grad_norm": 0.6979211568832397, + "learning_rate": 1e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7686919420957565, + "num_tokens": 732497637.0, + "step": 2546 + }, + { + "epoch": 0.907212822796082, + "grad_norm": 0.6935979127883911, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7654079645872116, + "num_tokens": 732791768.0, + "step": 2547 + }, + { + "epoch": 0.9075690115761353, + "grad_norm": 0.6725670099258423, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7654314786195755, + "num_tokens": 733076018.0, + "step": 2548 + }, + { + "epoch": 0.9079252003561887, + "grad_norm": 0.6742280721664429, + "learning_rate": 1e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7580274492502213, + "num_tokens": 733367080.0, + "step": 
2549 + }, + { + "epoch": 0.9082813891362422, + "grad_norm": 0.7087217569351196, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7400040477514267, + "num_tokens": 733646555.0, + "step": 2550 + }, + { + "epoch": 0.9086375779162956, + "grad_norm": 0.735878586769104, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7629302144050598, + "num_tokens": 733876915.0, + "step": 2551 + }, + { + "epoch": 0.908993766696349, + "grad_norm": 0.7265425324440002, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.763511523604393, + "num_tokens": 734143463.0, + "step": 2552 + }, + { + "epoch": 0.9093499554764025, + "grad_norm": 0.6876114010810852, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7652640044689178, + "num_tokens": 734436703.0, + "step": 2553 + }, + { + "epoch": 0.9097061442564559, + "grad_norm": 0.6717581152915955, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7710043340921402, + "num_tokens": 734730323.0, + "step": 2554 + }, + { + "epoch": 0.9100623330365093, + "grad_norm": 0.6490539312362671, + "learning_rate": 1e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7775721848011017, + "num_tokens": 735048557.0, + "step": 2555 + }, + { + "epoch": 0.9104185218165628, + "grad_norm": 0.6739974617958069, + "learning_rate": 1e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7732713967561722, + "num_tokens": 735330573.0, + "step": 2556 + }, + { + "epoch": 0.9107747105966162, + "grad_norm": 0.7050812244415283, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7526630461215973, + "num_tokens": 735629873.0, + "step": 2557 + }, + { + "epoch": 0.9111308993766697, + "grad_norm": 0.7510181069374084, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7576088905334473, + "num_tokens": 735869580.0, + "step": 2558 + }, + { + "epoch": 0.9114870881567231, + "grad_norm": 0.7089340090751648, + "learning_rate": 1e-06, + "loss": 0.7656, + 
"mean_token_accuracy": 0.7570234090089798, + "num_tokens": 736112003.0, + "step": 2559 + }, + { + "epoch": 0.9118432769367765, + "grad_norm": 0.6782230138778687, + "learning_rate": 1e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7662149667739868, + "num_tokens": 736406074.0, + "step": 2560 + }, + { + "epoch": 0.91219946571683, + "grad_norm": 0.6830840706825256, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7648216336965561, + "num_tokens": 736693436.0, + "step": 2561 + }, + { + "epoch": 0.9125556544968834, + "grad_norm": 0.7131302356719971, + "learning_rate": 1e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.7441070675849915, + "num_tokens": 736956086.0, + "step": 2562 + }, + { + "epoch": 0.9129118432769368, + "grad_norm": 0.6923241019248962, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7486142367124557, + "num_tokens": 737242792.0, + "step": 2563 + }, + { + "epoch": 0.9132680320569903, + "grad_norm": 0.7130152583122253, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7644911557435989, + "num_tokens": 737505968.0, + "step": 2564 + }, + { + "epoch": 0.9136242208370436, + "grad_norm": 0.7417703866958618, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.759172648191452, + "num_tokens": 737783733.0, + "step": 2565 + }, + { + "epoch": 0.913980409617097, + "grad_norm": 0.7040385007858276, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7745690792798996, + "num_tokens": 738085405.0, + "step": 2566 + }, + { + "epoch": 0.9143365983971505, + "grad_norm": 0.6663094758987427, + "learning_rate": 1e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7660122364759445, + "num_tokens": 738390813.0, + "step": 2567 + }, + { + "epoch": 0.9146927871772039, + "grad_norm": 0.7152611613273621, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7556240409612656, + "num_tokens": 738654954.0, + "step": 2568 + }, + { + "epoch": 0.9150489759572573, + 
"grad_norm": 0.7417580485343933, + "learning_rate": 1e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.774662971496582, + "num_tokens": 738928865.0, + "step": 2569 + }, + { + "epoch": 0.9154051647373108, + "grad_norm": 0.6793308854103088, + "learning_rate": 1e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.7502871751785278, + "num_tokens": 739194220.0, + "step": 2570 + }, + { + "epoch": 0.9157613535173642, + "grad_norm": 0.7063408493995667, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7607257664203644, + "num_tokens": 739444537.0, + "step": 2571 + }, + { + "epoch": 0.9161175422974176, + "grad_norm": 0.7089411616325378, + "learning_rate": 1e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7578674107789993, + "num_tokens": 739762309.0, + "step": 2572 + }, + { + "epoch": 0.9164737310774711, + "grad_norm": 0.7055795192718506, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7706003785133362, + "num_tokens": 740050695.0, + "step": 2573 + }, + { + "epoch": 0.9168299198575245, + "grad_norm": 0.6903508901596069, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7780681550502777, + "num_tokens": 740323487.0, + "step": 2574 + }, + { + "epoch": 0.9171861086375779, + "grad_norm": 0.6786745190620422, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7548892647027969, + "num_tokens": 740625252.0, + "step": 2575 + }, + { + "epoch": 0.9175422974176314, + "grad_norm": 0.7312742471694946, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7548792064189911, + "num_tokens": 740917025.0, + "step": 2576 + }, + { + "epoch": 0.9178984861976848, + "grad_norm": 0.6611191034317017, + "learning_rate": 1e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7524598687887192, + "num_tokens": 741228528.0, + "step": 2577 + }, + { + "epoch": 0.9182546749777382, + "grad_norm": 0.6706463098526001, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7787367105484009, + 
"num_tokens": 741512260.0, + "step": 2578 + }, + { + "epoch": 0.9186108637577917, + "grad_norm": 0.7482878565788269, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.746290847659111, + "num_tokens": 741757681.0, + "step": 2579 + }, + { + "epoch": 0.918967052537845, + "grad_norm": 0.7240157127380371, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7568031400442123, + "num_tokens": 742039759.0, + "step": 2580 + }, + { + "epoch": 0.9193232413178984, + "grad_norm": 0.6854937672615051, + "learning_rate": 1e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7709170430898666, + "num_tokens": 742321007.0, + "step": 2581 + }, + { + "epoch": 0.9196794300979519, + "grad_norm": 0.7166152596473694, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7456384003162384, + "num_tokens": 742630970.0, + "step": 2582 + }, + { + "epoch": 0.9200356188780053, + "grad_norm": 0.6460461020469666, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7611159682273865, + "num_tokens": 742939121.0, + "step": 2583 + }, + { + "epoch": 0.9203918076580587, + "grad_norm": 0.6306645274162292, + "learning_rate": 1e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7708381712436676, + "num_tokens": 743264714.0, + "step": 2584 + }, + { + "epoch": 0.9207479964381122, + "grad_norm": 0.6381183862686157, + "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7596455216407776, + "num_tokens": 743578734.0, + "step": 2585 + }, + { + "epoch": 0.9211041852181656, + "grad_norm": 0.6729536652565002, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7620727270841599, + "num_tokens": 743855753.0, + "step": 2586 + }, + { + "epoch": 0.921460373998219, + "grad_norm": 0.6557164788246155, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7792374938726425, + "num_tokens": 744147460.0, + "step": 2587 + }, + { + "epoch": 0.9218165627782725, + "grad_norm": 0.6798202991485596, + 
"learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.75322026014328, + "num_tokens": 744415790.0, + "step": 2588 + }, + { + "epoch": 0.9221727515583259, + "grad_norm": 0.6595281958580017, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7720719426870346, + "num_tokens": 744692020.0, + "step": 2589 + }, + { + "epoch": 0.9225289403383793, + "grad_norm": 0.6430357098579407, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7454604506492615, + "num_tokens": 745010526.0, + "step": 2590 + }, + { + "epoch": 0.9228851291184328, + "grad_norm": 0.6682298183441162, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7620165050029755, + "num_tokens": 745321619.0, + "step": 2591 + }, + { + "epoch": 0.9232413178984862, + "grad_norm": 0.6548498272895813, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7696680575609207, + "num_tokens": 745616627.0, + "step": 2592 + }, + { + "epoch": 0.9235975066785397, + "grad_norm": 0.6781511306762695, + "learning_rate": 1e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7698836624622345, + "num_tokens": 745894071.0, + "step": 2593 + }, + { + "epoch": 0.9239536954585931, + "grad_norm": 0.726268470287323, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7540027797222137, + "num_tokens": 746155506.0, + "step": 2594 + }, + { + "epoch": 0.9243098842386465, + "grad_norm": 0.6376801133155823, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7602002769708633, + "num_tokens": 746480371.0, + "step": 2595 + }, + { + "epoch": 0.9246660730187, + "grad_norm": 0.7028287649154663, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7635394930839539, + "num_tokens": 746742651.0, + "step": 2596 + }, + { + "epoch": 0.9250222617987534, + "grad_norm": 0.698856770992279, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7474882900714874, + "num_tokens": 747032033.0, + "step": 2597 + 
}, + { + "epoch": 0.9253784505788067, + "grad_norm": 0.7017821073532104, + "learning_rate": 1e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7592183351516724, + "num_tokens": 747312446.0, + "step": 2598 + }, + { + "epoch": 0.9257346393588602, + "grad_norm": 0.6883251667022705, + "learning_rate": 1e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7501732409000397, + "num_tokens": 747582664.0, + "step": 2599 + }, + { + "epoch": 0.9260908281389136, + "grad_norm": 0.6586000323295593, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7462805807590485, + "num_tokens": 747884944.0, + "step": 2600 + }, + { + "epoch": 0.926447016918967, + "grad_norm": 0.6058696508407593, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7631751745939255, + "num_tokens": 748242746.0, + "step": 2601 + }, + { + "epoch": 0.9268032056990205, + "grad_norm": 0.6886298656463623, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7584025263786316, + "num_tokens": 748537010.0, + "step": 2602 + }, + { + "epoch": 0.9271593944790739, + "grad_norm": 0.6683538556098938, + "learning_rate": 1e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.7748991549015045, + "num_tokens": 748821542.0, + "step": 2603 + }, + { + "epoch": 0.9275155832591273, + "grad_norm": 0.6438900232315063, + "learning_rate": 1e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.7880685776472092, + "num_tokens": 749129765.0, + "step": 2604 + }, + { + "epoch": 0.9278717720391808, + "grad_norm": 0.6677286028862, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.774739220738411, + "num_tokens": 749416906.0, + "step": 2605 + }, + { + "epoch": 0.9282279608192342, + "grad_norm": 0.6409021019935608, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7659482806921005, + "num_tokens": 749729420.0, + "step": 2606 + }, + { + "epoch": 0.9285841495992876, + "grad_norm": 0.6633399724960327, + "learning_rate": 1e-06, + "loss": 0.7702, + 
"mean_token_accuracy": 0.759553074836731, + "num_tokens": 750048407.0, + "step": 2607 + }, + { + "epoch": 0.9289403383793411, + "grad_norm": 0.6562366485595703, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7520754486322403, + "num_tokens": 750362503.0, + "step": 2608 + }, + { + "epoch": 0.9292965271593945, + "grad_norm": 0.6319327354431152, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7613875418901443, + "num_tokens": 750680165.0, + "step": 2609 + }, + { + "epoch": 0.9296527159394479, + "grad_norm": 0.6332548260688782, + "learning_rate": 1e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7694965749979019, + "num_tokens": 750980958.0, + "step": 2610 + }, + { + "epoch": 0.9300089047195014, + "grad_norm": 0.6571313142776489, + "learning_rate": 1e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.7815006375312805, + "num_tokens": 751276690.0, + "step": 2611 + }, + { + "epoch": 0.9303650934995548, + "grad_norm": 0.6997376680374146, + "learning_rate": 1e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7648434937000275, + "num_tokens": 751554934.0, + "step": 2612 + }, + { + "epoch": 0.9307212822796082, + "grad_norm": 0.6536148190498352, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7612162977457047, + "num_tokens": 751861021.0, + "step": 2613 + }, + { + "epoch": 0.9310774710596617, + "grad_norm": 0.6999965310096741, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7670154720544815, + "num_tokens": 752163989.0, + "step": 2614 + }, + { + "epoch": 0.931433659839715, + "grad_norm": 0.6911618113517761, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7712168097496033, + "num_tokens": 752450478.0, + "step": 2615 + }, + { + "epoch": 0.9317898486197684, + "grad_norm": 0.6500999331474304, + "learning_rate": 1e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7676194906234741, + "num_tokens": 752729259.0, + "step": 2616 + }, + { + "epoch": 0.9321460373998219, + 
"grad_norm": 0.7428213357925415, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7504156231880188, + "num_tokens": 752998699.0, + "step": 2617 + }, + { + "epoch": 0.9325022261798753, + "grad_norm": 0.624297559261322, + "learning_rate": 1e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7743678241968155, + "num_tokens": 753305889.0, + "step": 2618 + }, + { + "epoch": 0.9328584149599287, + "grad_norm": 0.6424413919448853, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7464075833559036, + "num_tokens": 753634075.0, + "step": 2619 + }, + { + "epoch": 0.9332146037399822, + "grad_norm": 0.733549952507019, + "learning_rate": 1e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7739159315824509, + "num_tokens": 753900181.0, + "step": 2620 + }, + { + "epoch": 0.9335707925200356, + "grad_norm": 0.6687577366828918, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7682770043611526, + "num_tokens": 754186963.0, + "step": 2621 + }, + { + "epoch": 0.933926981300089, + "grad_norm": 0.6393203139305115, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7712734341621399, + "num_tokens": 754500385.0, + "step": 2622 + }, + { + "epoch": 0.9342831700801425, + "grad_norm": 0.6523320078849792, + "learning_rate": 1e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7785117328166962, + "num_tokens": 754775100.0, + "step": 2623 + }, + { + "epoch": 0.9346393588601959, + "grad_norm": 0.6838816404342651, + "learning_rate": 1e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7445654422044754, + "num_tokens": 755052308.0, + "step": 2624 + }, + { + "epoch": 0.9349955476402493, + "grad_norm": 0.6402662396430969, + "learning_rate": 1e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.7688082009553909, + "num_tokens": 755373036.0, + "step": 2625 + }, + { + "epoch": 0.9353517364203028, + "grad_norm": 0.6665554642677307, + "learning_rate": 1e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.76254902780056, + 
"num_tokens": 755642187.0, + "step": 2626 + }, + { + "epoch": 0.9357079252003562, + "grad_norm": 0.6417909860610962, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.754176676273346, + "num_tokens": 755949019.0, + "step": 2627 + }, + { + "epoch": 0.9360641139804096, + "grad_norm": 0.7373785376548767, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7701837122440338, + "num_tokens": 756195778.0, + "step": 2628 + }, + { + "epoch": 0.9364203027604631, + "grad_norm": 0.7028212547302246, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7568409144878387, + "num_tokens": 756471089.0, + "step": 2629 + }, + { + "epoch": 0.9367764915405165, + "grad_norm": 0.7069362998008728, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7432703077793121, + "num_tokens": 756739663.0, + "step": 2630 + }, + { + "epoch": 0.93713268032057, + "grad_norm": 0.6755791306495667, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7654891163110733, + "num_tokens": 757031124.0, + "step": 2631 + }, + { + "epoch": 0.9374888691006233, + "grad_norm": 0.6399267911911011, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7759774476289749, + "num_tokens": 757327415.0, + "step": 2632 + }, + { + "epoch": 0.9378450578806767, + "grad_norm": 0.6745613217353821, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7664054185152054, + "num_tokens": 757605527.0, + "step": 2633 + }, + { + "epoch": 0.9382012466607302, + "grad_norm": 0.6860653162002563, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7428642213344574, + "num_tokens": 757906124.0, + "step": 2634 + }, + { + "epoch": 0.9385574354407836, + "grad_norm": 0.6765120625495911, + "learning_rate": 1e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7577037662267685, + "num_tokens": 758189891.0, + "step": 2635 + }, + { + "epoch": 0.938913624220837, + "grad_norm": 0.6806002259254456, + 
"learning_rate": 1e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7775188684463501, + "num_tokens": 758476611.0, + "step": 2636 + }, + { + "epoch": 0.9392698130008905, + "grad_norm": 0.716530442237854, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7312114089727402, + "num_tokens": 758735801.0, + "step": 2637 + }, + { + "epoch": 0.9396260017809439, + "grad_norm": 0.6624738574028015, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7547862082719803, + "num_tokens": 759027139.0, + "step": 2638 + }, + { + "epoch": 0.9399821905609973, + "grad_norm": 0.7309219241142273, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7366128414869308, + "num_tokens": 759296749.0, + "step": 2639 + }, + { + "epoch": 0.9403383793410508, + "grad_norm": 0.6459267735481262, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7613812983036041, + "num_tokens": 759590101.0, + "step": 2640 + }, + { + "epoch": 0.9406945681211042, + "grad_norm": 0.6798425316810608, + "learning_rate": 1e-06, + "loss": 0.7368, + "mean_token_accuracy": 0.7662416398525238, + "num_tokens": 759858227.0, + "step": 2641 + }, + { + "epoch": 0.9410507569011576, + "grad_norm": 0.6775726079940796, + "learning_rate": 1e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7658861726522446, + "num_tokens": 760123377.0, + "step": 2642 + }, + { + "epoch": 0.9414069456812111, + "grad_norm": 0.6519104242324829, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7666364461183548, + "num_tokens": 760421865.0, + "step": 2643 + }, + { + "epoch": 0.9417631344612645, + "grad_norm": 0.7023690342903137, + "learning_rate": 1e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7593126595020294, + "num_tokens": 760678185.0, + "step": 2644 + }, + { + "epoch": 0.9421193232413179, + "grad_norm": 0.6722052097320557, + "learning_rate": 1e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7541027963161469, + "num_tokens": 760955524.0, + "step": 2645 
+ }, + { + "epoch": 0.9424755120213714, + "grad_norm": 0.6550822257995605, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7623931765556335, + "num_tokens": 761257773.0, + "step": 2646 + }, + { + "epoch": 0.9428317008014248, + "grad_norm": 0.7211658954620361, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7579507827758789, + "num_tokens": 761545402.0, + "step": 2647 + }, + { + "epoch": 0.9431878895814781, + "grad_norm": 0.6724604964256287, + "learning_rate": 1e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.7497093379497528, + "num_tokens": 761830631.0, + "step": 2648 + }, + { + "epoch": 0.9435440783615316, + "grad_norm": 0.6892109513282776, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7619462311267853, + "num_tokens": 762097269.0, + "step": 2649 + }, + { + "epoch": 0.943900267141585, + "grad_norm": 0.6414384841918945, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7695296555757523, + "num_tokens": 762389876.0, + "step": 2650 + }, + { + "epoch": 0.9442564559216384, + "grad_norm": 0.672407329082489, + "learning_rate": 1e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7667032033205032, + "num_tokens": 762676989.0, + "step": 2651 + }, + { + "epoch": 0.9446126447016919, + "grad_norm": 0.6805648803710938, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7327083647251129, + "num_tokens": 762958916.0, + "step": 2652 + }, + { + "epoch": 0.9449688334817453, + "grad_norm": 0.6646837592124939, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7591285854578018, + "num_tokens": 763252651.0, + "step": 2653 + }, + { + "epoch": 0.9453250222617987, + "grad_norm": 0.6801563501358032, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7542224526405334, + "num_tokens": 763537985.0, + "step": 2654 + }, + { + "epoch": 0.9456812110418522, + "grad_norm": 0.6369266510009766, + "learning_rate": 1e-06, + "loss": 0.8124, + 
"mean_token_accuracy": 0.7444077730178833, + "num_tokens": 763833101.0, + "step": 2655 + }, + { + "epoch": 0.9460373998219056, + "grad_norm": 0.6747071146965027, + "learning_rate": 1e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.759064644575119, + "num_tokens": 764105611.0, + "step": 2656 + }, + { + "epoch": 0.946393588601959, + "grad_norm": 0.6830713152885437, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7436378002166748, + "num_tokens": 764407848.0, + "step": 2657 + }, + { + "epoch": 0.9467497773820125, + "grad_norm": 0.6642358899116516, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7488831430673599, + "num_tokens": 764731077.0, + "step": 2658 + }, + { + "epoch": 0.9471059661620659, + "grad_norm": 0.6854233741760254, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7657402157783508, + "num_tokens": 765013434.0, + "step": 2659 + }, + { + "epoch": 0.9474621549421193, + "grad_norm": 0.7378581762313843, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7580728232860565, + "num_tokens": 765250306.0, + "step": 2660 + }, + { + "epoch": 0.9478183437221728, + "grad_norm": 0.7453266382217407, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7525528222322464, + "num_tokens": 765488091.0, + "step": 2661 + }, + { + "epoch": 0.9481745325022262, + "grad_norm": 0.6963403820991516, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7588471472263336, + "num_tokens": 765772293.0, + "step": 2662 + }, + { + "epoch": 0.9485307212822796, + "grad_norm": 0.6266657114028931, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7636743634939194, + "num_tokens": 766083395.0, + "step": 2663 + }, + { + "epoch": 0.948886910062333, + "grad_norm": 0.6796392202377319, + "learning_rate": 1e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7588692605495453, + "num_tokens": 766367288.0, + "step": 2664 + }, + { + "epoch": 0.9492430988423864, + 
"grad_norm": 0.6541641354560852, + "learning_rate": 1e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7617029845714569, + "num_tokens": 766659357.0, + "step": 2665 + }, + { + "epoch": 0.94959928762244, + "grad_norm": 0.7531975507736206, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7688984721899033, + "num_tokens": 766899860.0, + "step": 2666 + }, + { + "epoch": 0.9499554764024933, + "grad_norm": 0.7666388154029846, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7359117269515991, + "num_tokens": 767136025.0, + "step": 2667 + }, + { + "epoch": 0.9503116651825467, + "grad_norm": 0.6886099576950073, + "learning_rate": 1e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7779785394668579, + "num_tokens": 767411548.0, + "step": 2668 + }, + { + "epoch": 0.9506678539626002, + "grad_norm": 0.6758452653884888, + "learning_rate": 1e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7531581670045853, + "num_tokens": 767685565.0, + "step": 2669 + }, + { + "epoch": 0.9510240427426536, + "grad_norm": 0.6374673247337341, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.772906944155693, + "num_tokens": 768008976.0, + "step": 2670 + }, + { + "epoch": 0.951380231522707, + "grad_norm": 0.703177809715271, + "learning_rate": 1e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7520349770784378, + "num_tokens": 768290531.0, + "step": 2671 + }, + { + "epoch": 0.9517364203027605, + "grad_norm": 0.6754008531570435, + "learning_rate": 1e-06, + "loss": 0.78, + "mean_token_accuracy": 0.753854900598526, + "num_tokens": 768577151.0, + "step": 2672 + }, + { + "epoch": 0.9520926090828139, + "grad_norm": 0.6767584085464478, + "learning_rate": 1e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7664849609136581, + "num_tokens": 768858133.0, + "step": 2673 + }, + { + "epoch": 0.9524487978628673, + "grad_norm": 0.6864094734191895, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7586161345243454, + 
"num_tokens": 769157824.0, + "step": 2674 + }, + { + "epoch": 0.9528049866429208, + "grad_norm": 0.6694618463516235, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7693723440170288, + "num_tokens": 769421663.0, + "step": 2675 + }, + { + "epoch": 0.9531611754229742, + "grad_norm": 0.722935676574707, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.751282349228859, + "num_tokens": 769671676.0, + "step": 2676 + }, + { + "epoch": 0.9535173642030276, + "grad_norm": 0.6899527311325073, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7385049909353256, + "num_tokens": 769950448.0, + "step": 2677 + }, + { + "epoch": 0.9538735529830811, + "grad_norm": 0.6717754602432251, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7637040466070175, + "num_tokens": 770237718.0, + "step": 2678 + }, + { + "epoch": 0.9542297417631345, + "grad_norm": 0.6758301854133606, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7611095607280731, + "num_tokens": 770558529.0, + "step": 2679 + }, + { + "epoch": 0.9545859305431879, + "grad_norm": 0.6872066855430603, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7553593963384628, + "num_tokens": 770813608.0, + "step": 2680 + }, + { + "epoch": 0.9549421193232414, + "grad_norm": 0.6929913759231567, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7615834474563599, + "num_tokens": 771092292.0, + "step": 2681 + }, + { + "epoch": 0.9552983081032947, + "grad_norm": 0.7271574139595032, + "learning_rate": 1e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7718441188335419, + "num_tokens": 771368787.0, + "step": 2682 + }, + { + "epoch": 0.9556544968833481, + "grad_norm": 0.6852301359176636, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7582962363958359, + "num_tokens": 771673840.0, + "step": 2683 + }, + { + "epoch": 0.9560106856634016, + "grad_norm": 0.6765145063400269, + 
"learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7627155929803848, + "num_tokens": 771973032.0, + "step": 2684 + }, + { + "epoch": 0.956366874443455, + "grad_norm": 0.6998546123504639, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7660117298364639, + "num_tokens": 772261901.0, + "step": 2685 + }, + { + "epoch": 0.9567230632235084, + "grad_norm": 0.7209973931312561, + "learning_rate": 1e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7674117833375931, + "num_tokens": 772538341.0, + "step": 2686 + }, + { + "epoch": 0.9570792520035619, + "grad_norm": 0.6638671159744263, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7542496770620346, + "num_tokens": 772848041.0, + "step": 2687 + }, + { + "epoch": 0.9574354407836153, + "grad_norm": 0.6289968490600586, + "learning_rate": 1e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7798883616924286, + "num_tokens": 773161230.0, + "step": 2688 + }, + { + "epoch": 0.9577916295636687, + "grad_norm": 0.6647046208381653, + "learning_rate": 1e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7487816959619522, + "num_tokens": 773463483.0, + "step": 2689 + }, + { + "epoch": 0.9581478183437222, + "grad_norm": 0.6450371146202087, + "learning_rate": 1e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7778222411870956, + "num_tokens": 773763091.0, + "step": 2690 + }, + { + "epoch": 0.9585040071237756, + "grad_norm": 0.6774519681930542, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.765495628118515, + "num_tokens": 774052285.0, + "step": 2691 + }, + { + "epoch": 0.958860195903829, + "grad_norm": 0.6947343349456787, + "learning_rate": 1e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7543844729661942, + "num_tokens": 774330320.0, + "step": 2692 + }, + { + "epoch": 0.9592163846838825, + "grad_norm": 0.67031329870224, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7633910179138184, + "num_tokens": 774639175.0, + "step": 2693 + 
}, + { + "epoch": 0.9595725734639359, + "grad_norm": 0.6877710223197937, + "learning_rate": 1e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7710807919502258, + "num_tokens": 774893805.0, + "step": 2694 + }, + { + "epoch": 0.9599287622439893, + "grad_norm": 0.6527031064033508, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7745619863271713, + "num_tokens": 775192165.0, + "step": 2695 + }, + { + "epoch": 0.9602849510240428, + "grad_norm": 0.6187237501144409, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7739393264055252, + "num_tokens": 775507731.0, + "step": 2696 + }, + { + "epoch": 0.9606411398040962, + "grad_norm": 0.6696570515632629, + "learning_rate": 1e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7569530010223389, + "num_tokens": 775791489.0, + "step": 2697 + }, + { + "epoch": 0.9609973285841495, + "grad_norm": 0.7611795663833618, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7640378326177597, + "num_tokens": 776048125.0, + "step": 2698 + }, + { + "epoch": 0.961353517364203, + "grad_norm": 0.6565390825271606, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.747205451130867, + "num_tokens": 776346845.0, + "step": 2699 + }, + { + "epoch": 0.9617097061442564, + "grad_norm": 0.7778176665306091, + "learning_rate": 1e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7584524601697922, + "num_tokens": 776566037.0, + "step": 2700 + }, + { + "epoch": 0.9620658949243099, + "grad_norm": 0.6749053597450256, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7438975274562836, + "num_tokens": 776855613.0, + "step": 2701 + }, + { + "epoch": 0.9624220837043633, + "grad_norm": 0.6759056448936462, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.774381086230278, + "num_tokens": 777147319.0, + "step": 2702 + }, + { + "epoch": 0.9627782724844167, + "grad_norm": 0.6867930889129639, + "learning_rate": 1e-06, + "loss": 0.8073, + 
"mean_token_accuracy": 0.7522396296262741, + "num_tokens": 777426368.0, + "step": 2703 + }, + { + "epoch": 0.9631344612644702, + "grad_norm": 0.6469715237617493, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7649188935756683, + "num_tokens": 777704334.0, + "step": 2704 + }, + { + "epoch": 0.9634906500445236, + "grad_norm": 0.6902132034301758, + "learning_rate": 1e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7540173977613449, + "num_tokens": 777974713.0, + "step": 2705 + }, + { + "epoch": 0.963846838824577, + "grad_norm": 0.6571199297904968, + "learning_rate": 1e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7743148803710938, + "num_tokens": 778259634.0, + "step": 2706 + }, + { + "epoch": 0.9642030276046305, + "grad_norm": 0.6697283983230591, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7430215775966644, + "num_tokens": 778556640.0, + "step": 2707 + }, + { + "epoch": 0.9645592163846839, + "grad_norm": 0.6568761467933655, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7653451412916183, + "num_tokens": 778863420.0, + "step": 2708 + }, + { + "epoch": 0.9649154051647373, + "grad_norm": 0.6718494296073914, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7579880356788635, + "num_tokens": 779153814.0, + "step": 2709 + }, + { + "epoch": 0.9652715939447908, + "grad_norm": 0.7040523290634155, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7625411003828049, + "num_tokens": 779410598.0, + "step": 2710 + }, + { + "epoch": 0.9656277827248442, + "grad_norm": 0.7124570608139038, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7553056925535202, + "num_tokens": 779664126.0, + "step": 2711 + }, + { + "epoch": 0.9659839715048976, + "grad_norm": 0.6811936497688293, + "learning_rate": 1e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7691504657268524, + "num_tokens": 779942551.0, + "step": 2712 + }, + { + "epoch": 0.9663401602849511, + 
"grad_norm": 0.6647777557373047, + "learning_rate": 1e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7611490190029144, + "num_tokens": 780228383.0, + "step": 2713 + }, + { + "epoch": 0.9666963490650045, + "grad_norm": 0.6620580554008484, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7719505280256271, + "num_tokens": 780531278.0, + "step": 2714 + }, + { + "epoch": 0.9670525378450578, + "grad_norm": 0.6890320181846619, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7656739503145218, + "num_tokens": 780798596.0, + "step": 2715 + }, + { + "epoch": 0.9674087266251113, + "grad_norm": 0.6715032458305359, + "learning_rate": 1e-06, + "loss": 0.6743, + "mean_token_accuracy": 0.780614510178566, + "num_tokens": 781075359.0, + "step": 2716 + }, + { + "epoch": 0.9677649154051647, + "grad_norm": 0.6743516325950623, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7663257569074631, + "num_tokens": 781348191.0, + "step": 2717 + }, + { + "epoch": 0.9681211041852181, + "grad_norm": 0.7231532335281372, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7563924491405487, + "num_tokens": 781627191.0, + "step": 2718 + }, + { + "epoch": 0.9684772929652716, + "grad_norm": 0.6427193284034729, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7718208581209183, + "num_tokens": 781918388.0, + "step": 2719 + }, + { + "epoch": 0.968833481745325, + "grad_norm": 0.6653052568435669, + "learning_rate": 1e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7718464732170105, + "num_tokens": 782206997.0, + "step": 2720 + }, + { + "epoch": 0.9691896705253784, + "grad_norm": 0.7083798050880432, + "learning_rate": 1e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7645457088947296, + "num_tokens": 782470873.0, + "step": 2721 + }, + { + "epoch": 0.9695458593054319, + "grad_norm": 0.6264573931694031, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.761421874165535, + 
"num_tokens": 782799816.0, + "step": 2722 + }, + { + "epoch": 0.9699020480854853, + "grad_norm": 0.6941909790039062, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7566730827093124, + "num_tokens": 783081554.0, + "step": 2723 + }, + { + "epoch": 0.9702582368655387, + "grad_norm": 0.7221807241439819, + "learning_rate": 1e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7704295814037323, + "num_tokens": 783367609.0, + "step": 2724 + }, + { + "epoch": 0.9706144256455922, + "grad_norm": 0.6385066509246826, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.76601842045784, + "num_tokens": 783675313.0, + "step": 2725 + }, + { + "epoch": 0.9709706144256456, + "grad_norm": 0.6654444336891174, + "learning_rate": 1e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7787407487630844, + "num_tokens": 783973039.0, + "step": 2726 + }, + { + "epoch": 0.971326803205699, + "grad_norm": 0.6863592863082886, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7700602263212204, + "num_tokens": 784280184.0, + "step": 2727 + }, + { + "epoch": 0.9716829919857525, + "grad_norm": 0.6972349286079407, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7473190575838089, + "num_tokens": 784551862.0, + "step": 2728 + }, + { + "epoch": 0.9720391807658059, + "grad_norm": 0.6890872120857239, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7526633441448212, + "num_tokens": 784833820.0, + "step": 2729 + }, + { + "epoch": 0.9723953695458593, + "grad_norm": 0.6826252937316895, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7659453302621841, + "num_tokens": 785127362.0, + "step": 2730 + }, + { + "epoch": 0.9727515583259128, + "grad_norm": 0.6882370710372925, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7589222341775894, + "num_tokens": 785425486.0, + "step": 2731 + }, + { + "epoch": 0.9731077471059661, + "grad_norm": 0.7088475823402405, + 
"learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.76336570084095, + "num_tokens": 785713276.0, + "step": 2732 + }, + { + "epoch": 0.9734639358860195, + "grad_norm": 0.7251241207122803, + "learning_rate": 1e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7711261957883835, + "num_tokens": 785995990.0, + "step": 2733 + }, + { + "epoch": 0.973820124666073, + "grad_norm": 0.6639226078987122, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7513699531555176, + "num_tokens": 786279228.0, + "step": 2734 + }, + { + "epoch": 0.9741763134461264, + "grad_norm": 0.6789643168449402, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7637166380882263, + "num_tokens": 786555306.0, + "step": 2735 + }, + { + "epoch": 0.9745325022261799, + "grad_norm": 0.6975098252296448, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7673785835504532, + "num_tokens": 786846838.0, + "step": 2736 + }, + { + "epoch": 0.9748886910062333, + "grad_norm": 0.7095040082931519, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7433652579784393, + "num_tokens": 787130692.0, + "step": 2737 + }, + { + "epoch": 0.9752448797862867, + "grad_norm": 0.6907054781913757, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7614781558513641, + "num_tokens": 787412959.0, + "step": 2738 + }, + { + "epoch": 0.9756010685663402, + "grad_norm": 0.6558875441551208, + "learning_rate": 1e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7796699851751328, + "num_tokens": 787703913.0, + "step": 2739 + }, + { + "epoch": 0.9759572573463936, + "grad_norm": 0.6756333708763123, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7659257054328918, + "num_tokens": 787987170.0, + "step": 2740 + }, + { + "epoch": 0.976313446126447, + "grad_norm": 0.6697415709495544, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7679530084133148, + "num_tokens": 788268249.0, + "step": 2741 
+ }, + { + "epoch": 0.9766696349065005, + "grad_norm": 0.6685581803321838, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7594376355409622, + "num_tokens": 788553258.0, + "step": 2742 + }, + { + "epoch": 0.9770258236865539, + "grad_norm": 0.6917584538459778, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7614437937736511, + "num_tokens": 788830033.0, + "step": 2743 + }, + { + "epoch": 0.9773820124666073, + "grad_norm": 0.6230137944221497, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7588483095169067, + "num_tokens": 789170383.0, + "step": 2744 + }, + { + "epoch": 0.9777382012466608, + "grad_norm": 0.6567584276199341, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7483427226543427, + "num_tokens": 789479801.0, + "step": 2745 + }, + { + "epoch": 0.9780943900267142, + "grad_norm": 0.6655515432357788, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7503378093242645, + "num_tokens": 789776432.0, + "step": 2746 + }, + { + "epoch": 0.9784505788067676, + "grad_norm": 0.6817856431007385, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7573472559452057, + "num_tokens": 790056477.0, + "step": 2747 + }, + { + "epoch": 0.9788067675868211, + "grad_norm": 0.7292168140411377, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7638347893953323, + "num_tokens": 790301593.0, + "step": 2748 + }, + { + "epoch": 0.9791629563668744, + "grad_norm": 0.736605703830719, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7560785561800003, + "num_tokens": 790589616.0, + "step": 2749 + }, + { + "epoch": 0.9795191451469278, + "grad_norm": 0.7371375560760498, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7645726650953293, + "num_tokens": 790864638.0, + "step": 2750 + }, + { + "epoch": 0.9798753339269813, + "grad_norm": 0.6586203575134277, + "learning_rate": 1e-06, + "loss": 0.7686, + 
"mean_token_accuracy": 0.7616358250379562, + "num_tokens": 791172805.0, + "step": 2751 + }, + { + "epoch": 0.9802315227070347, + "grad_norm": 0.728356122970581, + "learning_rate": 1e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.755391463637352, + "num_tokens": 791454460.0, + "step": 2752 + }, + { + "epoch": 0.9805877114870881, + "grad_norm": 0.7513905763626099, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7622164785861969, + "num_tokens": 791694804.0, + "step": 2753 + }, + { + "epoch": 0.9809439002671416, + "grad_norm": 0.7082616090774536, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7596126794815063, + "num_tokens": 791976776.0, + "step": 2754 + }, + { + "epoch": 0.981300089047195, + "grad_norm": 0.6486513614654541, + "learning_rate": 1e-06, + "loss": 0.681, + "mean_token_accuracy": 0.783649742603302, + "num_tokens": 792253333.0, + "step": 2755 + }, + { + "epoch": 0.9816562778272484, + "grad_norm": 0.6401196718215942, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7619394659996033, + "num_tokens": 792568873.0, + "step": 2756 + }, + { + "epoch": 0.9820124666073019, + "grad_norm": 0.6719501614570618, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7592524588108063, + "num_tokens": 792881358.0, + "step": 2757 + }, + { + "epoch": 0.9823686553873553, + "grad_norm": 0.7099676728248596, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7447432577610016, + "num_tokens": 793173055.0, + "step": 2758 + }, + { + "epoch": 0.9827248441674087, + "grad_norm": 0.6679719686508179, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.75095134973526, + "num_tokens": 793483566.0, + "step": 2759 + }, + { + "epoch": 0.9830810329474622, + "grad_norm": 0.6780906319618225, + "learning_rate": 1e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7684450745582581, + "num_tokens": 793766949.0, + "step": 2760 + }, + { + "epoch": 0.9834372217275156, + 
"grad_norm": 0.6672565937042236, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7641419917345047, + "num_tokens": 794078460.0, + "step": 2761 + }, + { + "epoch": 0.983793410507569, + "grad_norm": 0.7030552625656128, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7499317228794098, + "num_tokens": 794362241.0, + "step": 2762 + }, + { + "epoch": 0.9841495992876225, + "grad_norm": 0.6717263460159302, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7670758813619614, + "num_tokens": 794666325.0, + "step": 2763 + }, + { + "epoch": 0.9845057880676759, + "grad_norm": 0.7183770537376404, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7580846846103668, + "num_tokens": 794925700.0, + "step": 2764 + }, + { + "epoch": 0.9848619768477292, + "grad_norm": 0.688250720500946, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7676670998334885, + "num_tokens": 795196203.0, + "step": 2765 + }, + { + "epoch": 0.9852181656277827, + "grad_norm": 0.6875635981559753, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7483802139759064, + "num_tokens": 795479487.0, + "step": 2766 + }, + { + "epoch": 0.9855743544078361, + "grad_norm": 0.6549299955368042, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7765454798936844, + "num_tokens": 795793237.0, + "step": 2767 + }, + { + "epoch": 0.9859305431878895, + "grad_norm": 0.6858444809913635, + "learning_rate": 1e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.760837659239769, + "num_tokens": 796079062.0, + "step": 2768 + }, + { + "epoch": 0.986286731967943, + "grad_norm": 0.6418175101280212, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7669044733047485, + "num_tokens": 796393858.0, + "step": 2769 + }, + { + "epoch": 0.9866429207479964, + "grad_norm": 0.7154314517974854, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7537399679422379, + 
"num_tokens": 796672795.0, + "step": 2770 + }, + { + "epoch": 0.9869991095280498, + "grad_norm": 0.6606261730194092, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.763374000787735, + "num_tokens": 796981774.0, + "step": 2771 + }, + { + "epoch": 0.9873552983081033, + "grad_norm": 0.6860007643699646, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7621530890464783, + "num_tokens": 797283002.0, + "step": 2772 + }, + { + "epoch": 0.9877114870881567, + "grad_norm": 0.6832521557807922, + "learning_rate": 1e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7428030967712402, + "num_tokens": 797559170.0, + "step": 2773 + }, + { + "epoch": 0.9880676758682102, + "grad_norm": 0.6752592921257019, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.746315211057663, + "num_tokens": 797854730.0, + "step": 2774 + }, + { + "epoch": 0.9884238646482636, + "grad_norm": 0.7222683429718018, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7717399448156357, + "num_tokens": 798128876.0, + "step": 2775 + }, + { + "epoch": 0.988780053428317, + "grad_norm": 0.6948894262313843, + "learning_rate": 1e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.755454495549202, + "num_tokens": 798407876.0, + "step": 2776 + }, + { + "epoch": 0.9891362422083705, + "grad_norm": 0.6961714029312134, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7566887438297272, + "num_tokens": 798673671.0, + "step": 2777 + }, + { + "epoch": 0.9894924309884239, + "grad_norm": 0.6885727643966675, + "learning_rate": 1e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7517627328634262, + "num_tokens": 798952252.0, + "step": 2778 + }, + { + "epoch": 0.9898486197684773, + "grad_norm": 0.6751962304115295, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7644850611686707, + "num_tokens": 799245921.0, + "step": 2779 + }, + { + "epoch": 0.9902048085485308, + "grad_norm": 0.7316889762878418, + 
"learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.762976199388504, + "num_tokens": 799495241.0, + "step": 2780 + }, + { + "epoch": 0.9905609973285842, + "grad_norm": 0.6502657532691956, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7513080686330795, + "num_tokens": 799805796.0, + "step": 2781 + }, + { + "epoch": 0.9909171861086375, + "grad_norm": 0.6681564450263977, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7589562982320786, + "num_tokens": 800120053.0, + "step": 2782 + }, + { + "epoch": 0.991273374888691, + "grad_norm": 0.6835052967071533, + "learning_rate": 1e-06, + "loss": 0.67, + "mean_token_accuracy": 0.7871560305356979, + "num_tokens": 800391916.0, + "step": 2783 + }, + { + "epoch": 0.9916295636687444, + "grad_norm": 0.6630085110664368, + "learning_rate": 1e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7773606032133102, + "num_tokens": 800684143.0, + "step": 2784 + }, + { + "epoch": 0.9919857524487978, + "grad_norm": 0.6778810620307922, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7653923630714417, + "num_tokens": 800979705.0, + "step": 2785 + }, + { + "epoch": 0.9923419412288513, + "grad_norm": 0.6852832436561584, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7721332162618637, + "num_tokens": 801254809.0, + "step": 2786 + }, + { + "epoch": 0.9926981300089047, + "grad_norm": 0.6729893684387207, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7640455514192581, + "num_tokens": 801563720.0, + "step": 2787 + }, + { + "epoch": 0.9930543187889581, + "grad_norm": 0.6856079697608948, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7692680507898331, + "num_tokens": 801828306.0, + "step": 2788 + }, + { + "epoch": 0.9934105075690116, + "grad_norm": 0.6837369799613953, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7563987523317337, + "num_tokens": 802116601.0, + "step": 2789 + 
}, + { + "epoch": 0.993766696349065, + "grad_norm": 0.7136049270629883, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7596304416656494, + "num_tokens": 802370453.0, + "step": 2790 + }, + { + "epoch": 0.9941228851291184, + "grad_norm": 0.6660546660423279, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7623187303543091, + "num_tokens": 802645342.0, + "step": 2791 + }, + { + "epoch": 0.9944790739091719, + "grad_norm": 0.6418343782424927, + "learning_rate": 1e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7680895775556564, + "num_tokens": 802954677.0, + "step": 2792 + }, + { + "epoch": 0.9948352626892253, + "grad_norm": 0.6531106233596802, + "learning_rate": 1e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7752409428358078, + "num_tokens": 803262531.0, + "step": 2793 + }, + { + "epoch": 0.9951914514692787, + "grad_norm": 0.6257500648498535, + "learning_rate": 1e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.780244454741478, + "num_tokens": 803582691.0, + "step": 2794 + }, + { + "epoch": 0.9955476402493322, + "grad_norm": 0.6768285632133484, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7303447276353836, + "num_tokens": 803874923.0, + "step": 2795 + }, + { + "epoch": 0.9959038290293856, + "grad_norm": 0.6722902059555054, + "learning_rate": 1e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7470046877861023, + "num_tokens": 804182405.0, + "step": 2796 + }, + { + "epoch": 0.996260017809439, + "grad_norm": 0.6815645098686218, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7503835409879684, + "num_tokens": 804471592.0, + "step": 2797 + }, + { + "epoch": 0.9966162065894925, + "grad_norm": 0.6946245431900024, + "learning_rate": 1e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.7798652797937393, + "num_tokens": 804752032.0, + "step": 2798 + }, + { + "epoch": 0.9969723953695458, + "grad_norm": 0.7058485746383667, + "learning_rate": 1e-06, + "loss": 0.7642, + 
"mean_token_accuracy": 0.7626858651638031, + "num_tokens": 805017404.0, + "step": 2799 + }, + { + "epoch": 0.9973285841495992, + "grad_norm": 0.7076333165168762, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7561352103948593, + "num_tokens": 805278111.0, + "step": 2800 + }, + { + "epoch": 0.9976847729296527, + "grad_norm": 0.6929389238357544, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7624682933092117, + "num_tokens": 805531508.0, + "step": 2801 + }, + { + "epoch": 0.9980409617097061, + "grad_norm": 0.6629091501235962, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7709852457046509, + "num_tokens": 805832100.0, + "step": 2802 + }, + { + "epoch": 0.9983971504897595, + "grad_norm": 0.6742632389068604, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.758186474442482, + "num_tokens": 806124410.0, + "step": 2803 + }, + { + "epoch": 0.998753339269813, + "grad_norm": 0.6678634285926819, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7662483900785446, + "num_tokens": 806410715.0, + "step": 2804 + }, + { + "epoch": 0.9991095280498664, + "grad_norm": 0.6720565557479858, + "learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7518336474895477, + "num_tokens": 806702349.0, + "step": 2805 + }, + { + "epoch": 0.9994657168299198, + "grad_norm": 0.6856051683425903, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7623118609189987, + "num_tokens": 806971704.0, + "step": 2806 + }, + { + "epoch": 0.9998219056099733, + "grad_norm": 0.7099072337150574, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7421988397836685, + "num_tokens": 807227347.0, + "step": 2807 + }, + { + "epoch": 1.0, + "grad_norm": 0.7099072337150574, + "learning_rate": 1e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.7836036384105682, + "num_tokens": 807393124.0, + "step": 2808 + }, + { + "epoch": 1.0003561887800534, + "grad_norm": 
0.41707083582878113, + "learning_rate": 1e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7564393132925034, + "num_tokens": 807686311.0, + "step": 2809 + }, + { + "epoch": 1.0007123775601068, + "grad_norm": 0.41873422265052795, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7635955959558487, + "num_tokens": 808023204.0, + "step": 2810 + }, + { + "epoch": 1.0010685663401604, + "grad_norm": 0.436829149723053, + "learning_rate": 1e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7747940272092819, + "num_tokens": 808368301.0, + "step": 2811 + }, + { + "epoch": 1.0014247551202138, + "grad_norm": 0.4824158847332001, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7453280091285706, + "num_tokens": 808660052.0, + "step": 2812 + }, + { + "epoch": 1.0017809439002672, + "grad_norm": 0.44294044375419617, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7731113880872726, + "num_tokens": 808962938.0, + "step": 2813 + }, + { + "epoch": 1.0021371326803206, + "grad_norm": 0.4331691861152649, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7452012002468109, + "num_tokens": 809231538.0, + "step": 2814 + }, + { + "epoch": 1.002493321460374, + "grad_norm": 0.5225152969360352, + "learning_rate": 1e-06, + "loss": 0.7631, + "mean_token_accuracy": 0.7545275390148163, + "num_tokens": 809501164.0, + "step": 2815 + }, + { + "epoch": 1.0028495102404273, + "grad_norm": 0.47111132740974426, + "learning_rate": 1e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7685803771018982, + "num_tokens": 809801636.0, + "step": 2816 + }, + { + "epoch": 1.003205699020481, + "grad_norm": 0.46769168972969055, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7626847922801971, + "num_tokens": 810095261.0, + "step": 2817 + }, + { + "epoch": 1.0035618878005343, + "grad_norm": 0.47437071800231934, + "learning_rate": 1e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7554489970207214, + "num_tokens": 
810379217.0, + "step": 2818 + }, + { + "epoch": 1.0039180765805877, + "grad_norm": 0.5132865309715271, + "learning_rate": 1e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7695056647062302, + "num_tokens": 810655527.0, + "step": 2819 + }, + { + "epoch": 1.0042742653606411, + "grad_norm": 0.43935179710388184, + "learning_rate": 1e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7709453105926514, + "num_tokens": 810933576.0, + "step": 2820 + }, + { + "epoch": 1.0046304541406945, + "grad_norm": 0.4534376561641693, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7746105492115021, + "num_tokens": 811214622.0, + "step": 2821 + }, + { + "epoch": 1.004986642920748, + "grad_norm": 0.5057229995727539, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7608812749385834, + "num_tokens": 811502515.0, + "step": 2822 + }, + { + "epoch": 1.0053428317008015, + "grad_norm": 0.5006139278411865, + "learning_rate": 1e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7606966942548752, + "num_tokens": 811814011.0, + "step": 2823 + }, + { + "epoch": 1.005699020480855, + "grad_norm": 0.42097926139831543, + "learning_rate": 1e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.765003502368927, + "num_tokens": 812108731.0, + "step": 2824 + }, + { + "epoch": 1.0060552092609083, + "grad_norm": 0.5318528413772583, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7492406964302063, + "num_tokens": 812366348.0, + "step": 2825 + }, + { + "epoch": 1.0064113980409617, + "grad_norm": 0.483951210975647, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7453905940055847, + "num_tokens": 812649394.0, + "step": 2826 + }, + { + "epoch": 1.006767586821015, + "grad_norm": 0.42962032556533813, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7325327545404434, + "num_tokens": 812936805.0, + "step": 2827 + }, + { + "epoch": 1.0071237756010685, + "grad_norm": 0.5090049505233765, + "learning_rate": 1e-06, + 
"loss": 0.7682, + "mean_token_accuracy": 0.7549055367708206, + "num_tokens": 813192700.0, + "step": 2828 + }, + { + "epoch": 1.007479964381122, + "grad_norm": 0.49182072281837463, + "learning_rate": 1e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7484990656375885, + "num_tokens": 813464074.0, + "step": 2829 + }, + { + "epoch": 1.0078361531611755, + "grad_norm": 0.49815234541893005, + "learning_rate": 1e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7550304234027863, + "num_tokens": 813755144.0, + "step": 2830 + }, + { + "epoch": 1.0081923419412289, + "grad_norm": 0.4673725664615631, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7698882967233658, + "num_tokens": 814023551.0, + "step": 2831 + }, + { + "epoch": 1.0085485307212823, + "grad_norm": 0.4869326055049896, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7744113206863403, + "num_tokens": 814320476.0, + "step": 2832 + }, + { + "epoch": 1.0089047195013356, + "grad_norm": 0.4484611749649048, + "learning_rate": 1e-06, + "loss": 0.6763, + "mean_token_accuracy": 0.7861098498106003, + "num_tokens": 814601459.0, + "step": 2833 + }, + { + "epoch": 1.009260908281389, + "grad_norm": 0.48757869005203247, + "learning_rate": 1e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7472775280475616, + "num_tokens": 814877718.0, + "step": 2834 + }, + { + "epoch": 1.0096170970614426, + "grad_norm": 0.5066636800765991, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7709704041481018, + "num_tokens": 815165671.0, + "step": 2835 + }, + { + "epoch": 1.009973285841496, + "grad_norm": 0.5066519379615784, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7622701525688171, + "num_tokens": 815418082.0, + "step": 2836 + }, + { + "epoch": 1.0103294746215494, + "grad_norm": 0.42476940155029297, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7602419257164001, + "num_tokens": 815724458.0, + "step": 2837 + }, + { + "epoch": 
1.0106856634016028, + "grad_norm": 0.4453437626361847, + "learning_rate": 1e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7556560337543488, + "num_tokens": 816012097.0, + "step": 2838 + }, + { + "epoch": 1.0110418521816562, + "grad_norm": 0.4765700399875641, + "learning_rate": 1e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7506545931100845, + "num_tokens": 816341383.0, + "step": 2839 + }, + { + "epoch": 1.0113980409617096, + "grad_norm": 0.47631487250328064, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7796834260225296, + "num_tokens": 816613575.0, + "step": 2840 + }, + { + "epoch": 1.0117542297417632, + "grad_norm": 0.5100927948951721, + "learning_rate": 1e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7620970010757446, + "num_tokens": 816875684.0, + "step": 2841 + }, + { + "epoch": 1.0121104185218166, + "grad_norm": 0.5093263983726501, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7546339184045792, + "num_tokens": 817156529.0, + "step": 2842 + }, + { + "epoch": 1.01246660730187, + "grad_norm": 0.4757498502731323, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7615365535020828, + "num_tokens": 817442108.0, + "step": 2843 + }, + { + "epoch": 1.0128227960819234, + "grad_norm": 0.464030385017395, + "learning_rate": 1e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7643335610628128, + "num_tokens": 817748985.0, + "step": 2844 + }, + { + "epoch": 1.0131789848619768, + "grad_norm": 0.4866922199726105, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.758593276143074, + "num_tokens": 818043802.0, + "step": 2845 + }, + { + "epoch": 1.0135351736420304, + "grad_norm": 0.504433274269104, + "learning_rate": 1e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7712543904781342, + "num_tokens": 818350847.0, + "step": 2846 + }, + { + "epoch": 1.0138913624220838, + "grad_norm": 0.47129613161087036, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 
0.7627162039279938, + "num_tokens": 818641873.0, + "step": 2847 + }, + { + "epoch": 1.0142475512021372, + "grad_norm": 0.477424293756485, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7716263085603714, + "num_tokens": 818913924.0, + "step": 2848 + }, + { + "epoch": 1.0146037399821906, + "grad_norm": 0.4944600462913513, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7717239707708359, + "num_tokens": 819180353.0, + "step": 2849 + }, + { + "epoch": 1.014959928762244, + "grad_norm": 0.4600664973258972, + "learning_rate": 1e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7739503383636475, + "num_tokens": 819457929.0, + "step": 2850 + }, + { + "epoch": 1.0153161175422973, + "grad_norm": 0.5245296955108643, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7679819911718369, + "num_tokens": 819717286.0, + "step": 2851 + }, + { + "epoch": 1.015672306322351, + "grad_norm": 0.4213576018810272, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7673059403896332, + "num_tokens": 820035538.0, + "step": 2852 + }, + { + "epoch": 1.0160284951024043, + "grad_norm": 0.48766693472862244, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.767678514122963, + "num_tokens": 820330522.0, + "step": 2853 + }, + { + "epoch": 1.0163846838824577, + "grad_norm": 0.5202435255050659, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7535788416862488, + "num_tokens": 820606437.0, + "step": 2854 + }, + { + "epoch": 1.0167408726625111, + "grad_norm": 0.49013257026672363, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7663050591945648, + "num_tokens": 820869790.0, + "step": 2855 + }, + { + "epoch": 1.0170970614425645, + "grad_norm": 0.47246253490448, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7746826559305191, + "num_tokens": 821178787.0, + "step": 2856 + }, + { + "epoch": 1.017453250222618, + "grad_norm": 
0.43971362709999084, + "learning_rate": 1e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.77941033244133, + "num_tokens": 821485639.0, + "step": 2857 + }, + { + "epoch": 1.0178094390026715, + "grad_norm": 0.4705054759979248, + "learning_rate": 1e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7744911164045334, + "num_tokens": 821754366.0, + "step": 2858 + }, + { + "epoch": 1.018165627782725, + "grad_norm": 0.5489979982376099, + "learning_rate": 1e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7560720890760422, + "num_tokens": 822021280.0, + "step": 2859 + }, + { + "epoch": 1.0185218165627783, + "grad_norm": 0.48012956976890564, + "learning_rate": 1e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7710678577423096, + "num_tokens": 822291855.0, + "step": 2860 + }, + { + "epoch": 1.0188780053428317, + "grad_norm": 0.5186047554016113, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7513036727905273, + "num_tokens": 822595935.0, + "step": 2861 + }, + { + "epoch": 1.019234194122885, + "grad_norm": 0.4343999922275543, + "learning_rate": 1e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7702271193265915, + "num_tokens": 822884658.0, + "step": 2862 + }, + { + "epoch": 1.0195903829029385, + "grad_norm": 0.4581238925457001, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7744714468717575, + "num_tokens": 823170386.0, + "step": 2863 + }, + { + "epoch": 1.019946571682992, + "grad_norm": 0.4743413031101227, + "learning_rate": 1e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7759902328252792, + "num_tokens": 823455284.0, + "step": 2864 + }, + { + "epoch": 1.0203027604630455, + "grad_norm": 0.4962853491306305, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7750308066606522, + "num_tokens": 823758655.0, + "step": 2865 + }, + { + "epoch": 1.0206589492430989, + "grad_norm": 0.527279794216156, + "learning_rate": 1e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7718760222196579, + "num_tokens": 
824010762.0, + "step": 2866 + }, + { + "epoch": 1.0210151380231522, + "grad_norm": 0.47157225012779236, + "learning_rate": 1e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7669539004564285, + "num_tokens": 824304201.0, + "step": 2867 + }, + { + "epoch": 1.0213713268032056, + "grad_norm": 0.49682092666625977, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7556475102901459, + "num_tokens": 824578093.0, + "step": 2868 + }, + { + "epoch": 1.021727515583259, + "grad_norm": 0.4977516829967499, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7564491778612137, + "num_tokens": 824875218.0, + "step": 2869 + }, + { + "epoch": 1.0220837043633126, + "grad_norm": 0.5112830996513367, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7627158910036087, + "num_tokens": 825145558.0, + "step": 2870 + }, + { + "epoch": 1.022439893143366, + "grad_norm": 0.45276570320129395, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.77017442882061, + "num_tokens": 825421498.0, + "step": 2871 + }, + { + "epoch": 1.0227960819234194, + "grad_norm": 0.47764864563941956, + "learning_rate": 1e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.749189019203186, + "num_tokens": 825708611.0, + "step": 2872 + }, + { + "epoch": 1.0231522707034728, + "grad_norm": 0.47407394647598267, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7406521439552307, + "num_tokens": 826007884.0, + "step": 2873 + }, + { + "epoch": 1.0235084594835262, + "grad_norm": 0.44277989864349365, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7483407407999039, + "num_tokens": 826296289.0, + "step": 2874 + }, + { + "epoch": 1.0238646482635796, + "grad_norm": 0.4261217713356018, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7773254215717316, + "num_tokens": 826582728.0, + "step": 2875 + }, + { + "epoch": 1.0242208370436332, + "grad_norm": 0.5384624004364014, + "learning_rate": 1e-06, 
+ "loss": 0.8065, + "mean_token_accuracy": 0.7474214136600494, + "num_tokens": 826826170.0, + "step": 2876 + }, + { + "epoch": 1.0245770258236866, + "grad_norm": 0.48311129212379456, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.760546013712883, + "num_tokens": 827125825.0, + "step": 2877 + }, + { + "epoch": 1.02493321460374, + "grad_norm": 0.5184524059295654, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7630449682474136, + "num_tokens": 827395836.0, + "step": 2878 + }, + { + "epoch": 1.0252894033837934, + "grad_norm": 0.3989325165748596, + "learning_rate": 1e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7659057378768921, + "num_tokens": 827721913.0, + "step": 2879 + }, + { + "epoch": 1.0256455921638468, + "grad_norm": 0.44355493783950806, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7716623991727829, + "num_tokens": 828028456.0, + "step": 2880 + }, + { + "epoch": 1.0260017809439002, + "grad_norm": 0.4321267902851105, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7665754705667496, + "num_tokens": 828336455.0, + "step": 2881 + }, + { + "epoch": 1.0263579697239538, + "grad_norm": 0.5015103816986084, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7464237660169601, + "num_tokens": 828595497.0, + "step": 2882 + }, + { + "epoch": 1.0267141585040072, + "grad_norm": 0.49290579557418823, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7687012106180191, + "num_tokens": 828876256.0, + "step": 2883 + }, + { + "epoch": 1.0270703472840605, + "grad_norm": 0.5061460733413696, + "learning_rate": 1e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7534992098808289, + "num_tokens": 829155107.0, + "step": 2884 + }, + { + "epoch": 1.027426536064114, + "grad_norm": 0.5149397850036621, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7483497262001038, + "num_tokens": 829436467.0, + "step": 2885 + }, + { + "epoch": 
1.0277827248441673, + "grad_norm": 0.4795144498348236, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7605857402086258, + "num_tokens": 829732630.0, + "step": 2886 + }, + { + "epoch": 1.028138913624221, + "grad_norm": 0.4879697263240814, + "learning_rate": 1e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.770164743065834, + "num_tokens": 830006934.0, + "step": 2887 + }, + { + "epoch": 1.0284951024042743, + "grad_norm": 0.461806058883667, + "learning_rate": 1e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.7894024699926376, + "num_tokens": 830310811.0, + "step": 2888 + }, + { + "epoch": 1.0288512911843277, + "grad_norm": 0.5033734440803528, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7636607438325882, + "num_tokens": 830582960.0, + "step": 2889 + }, + { + "epoch": 1.0292074799643811, + "grad_norm": 0.48201000690460205, + "learning_rate": 1e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7685332298278809, + "num_tokens": 830826322.0, + "step": 2890 + }, + { + "epoch": 1.0295636687444345, + "grad_norm": 0.48890650272369385, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.738326758146286, + "num_tokens": 831135850.0, + "step": 2891 + }, + { + "epoch": 1.029919857524488, + "grad_norm": 0.47113117575645447, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7553746998310089, + "num_tokens": 831423755.0, + "step": 2892 + }, + { + "epoch": 1.0302760463045415, + "grad_norm": 0.45448026061058044, + "learning_rate": 1e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7691803574562073, + "num_tokens": 831725715.0, + "step": 2893 + }, + { + "epoch": 1.030632235084595, + "grad_norm": 0.5098212361335754, + "learning_rate": 1e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7609353512525558, + "num_tokens": 832021717.0, + "step": 2894 + }, + { + "epoch": 1.0309884238646483, + "grad_norm": 0.4566635191440582, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 
0.7644995450973511, + "num_tokens": 832347613.0, + "step": 2895 + }, + { + "epoch": 1.0313446126447017, + "grad_norm": 0.4893587827682495, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7612462937831879, + "num_tokens": 832638527.0, + "step": 2896 + }, + { + "epoch": 1.031700801424755, + "grad_norm": 0.5077638030052185, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7671360820531845, + "num_tokens": 832909382.0, + "step": 2897 + }, + { + "epoch": 1.0320569902048085, + "grad_norm": 0.4716866612434387, + "learning_rate": 1e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7567261755466461, + "num_tokens": 833184974.0, + "step": 2898 + }, + { + "epoch": 1.032413178984862, + "grad_norm": 0.4645240604877472, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7561767846345901, + "num_tokens": 833458097.0, + "step": 2899 + }, + { + "epoch": 1.0327693677649155, + "grad_norm": 0.49443989992141724, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7454858273267746, + "num_tokens": 833744022.0, + "step": 2900 + }, + { + "epoch": 1.0331255565449688, + "grad_norm": 0.49640092253685, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7555088996887207, + "num_tokens": 834025470.0, + "step": 2901 + }, + { + "epoch": 1.0334817453250222, + "grad_norm": 0.5061529874801636, + "learning_rate": 1e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7537494599819183, + "num_tokens": 834229787.0, + "step": 2902 + }, + { + "epoch": 1.0338379341050756, + "grad_norm": 0.46402508020401, + "learning_rate": 1e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7748181521892548, + "num_tokens": 834530760.0, + "step": 2903 + }, + { + "epoch": 1.034194122885129, + "grad_norm": 0.4965580403804779, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7690098434686661, + "num_tokens": 834795558.0, + "step": 2904 + }, + { + "epoch": 1.0345503116651826, + "grad_norm": 
0.4590342342853546, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7492911219596863, + "num_tokens": 835098528.0, + "step": 2905 + }, + { + "epoch": 1.034906500445236, + "grad_norm": 0.4638325273990631, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7496176511049271, + "num_tokens": 835405148.0, + "step": 2906 + }, + { + "epoch": 1.0352626892252894, + "grad_norm": 0.5111831426620483, + "learning_rate": 1e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7741676270961761, + "num_tokens": 835679506.0, + "step": 2907 + }, + { + "epoch": 1.0356188780053428, + "grad_norm": 0.5300699472427368, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7624465525150299, + "num_tokens": 835938130.0, + "step": 2908 + }, + { + "epoch": 1.0359750667853962, + "grad_norm": 0.4683562219142914, + "learning_rate": 1e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.7762782424688339, + "num_tokens": 836252787.0, + "step": 2909 + }, + { + "epoch": 1.0363312555654496, + "grad_norm": 0.5428068041801453, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7588069438934326, + "num_tokens": 836521880.0, + "step": 2910 + }, + { + "epoch": 1.0366874443455032, + "grad_norm": 0.46030333638191223, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7695367485284805, + "num_tokens": 836814922.0, + "step": 2911 + }, + { + "epoch": 1.0370436331255566, + "grad_norm": 0.4392981231212616, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7733833491802216, + "num_tokens": 837108651.0, + "step": 2912 + }, + { + "epoch": 1.03739982190561, + "grad_norm": 0.5024381875991821, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7692636698484421, + "num_tokens": 837389185.0, + "step": 2913 + }, + { + "epoch": 1.0377560106856634, + "grad_norm": 0.44813597202301025, + "learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7694389969110489, + "num_tokens": 
837694370.0, + "step": 2914 + }, + { + "epoch": 1.0381121994657168, + "grad_norm": 0.4605119228363037, + "learning_rate": 1e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7668234556913376, + "num_tokens": 837981092.0, + "step": 2915 + }, + { + "epoch": 1.0384683882457701, + "grad_norm": 0.5143360495567322, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7735480070114136, + "num_tokens": 838255487.0, + "step": 2916 + }, + { + "epoch": 1.0388245770258238, + "grad_norm": 0.46057814359664917, + "learning_rate": 1e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7509575635194778, + "num_tokens": 838561170.0, + "step": 2917 + }, + { + "epoch": 1.0391807658058771, + "grad_norm": 0.4597727656364441, + "learning_rate": 1e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7699762284755707, + "num_tokens": 838870045.0, + "step": 2918 + }, + { + "epoch": 1.0395369545859305, + "grad_norm": 0.4454556703567505, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7577897608280182, + "num_tokens": 839166144.0, + "step": 2919 + }, + { + "epoch": 1.039893143365984, + "grad_norm": 0.44899871945381165, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7560140639543533, + "num_tokens": 839448566.0, + "step": 2920 + }, + { + "epoch": 1.0402493321460373, + "grad_norm": 0.5259367823600769, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7554109543561935, + "num_tokens": 839703028.0, + "step": 2921 + }, + { + "epoch": 1.040605520926091, + "grad_norm": 0.45316141843795776, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7620560675859451, + "num_tokens": 840010962.0, + "step": 2922 + }, + { + "epoch": 1.0409617097061443, + "grad_norm": 0.428474485874176, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7551222145557404, + "num_tokens": 840314551.0, + "step": 2923 + }, + { + "epoch": 1.0413178984861977, + "grad_norm": 0.4939417541027069, + "learning_rate": 1e-06, 
+ "loss": 0.718, + "mean_token_accuracy": 0.7704697549343109, + "num_tokens": 840612830.0, + "step": 2924 + }, + { + "epoch": 1.041674087266251, + "grad_norm": 0.4888269603252411, + "learning_rate": 1e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7710277140140533, + "num_tokens": 840895366.0, + "step": 2925 + }, + { + "epoch": 1.0420302760463045, + "grad_norm": 0.48329800367355347, + "learning_rate": 1e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7843232601881027, + "num_tokens": 841178830.0, + "step": 2926 + }, + { + "epoch": 1.0423864648263579, + "grad_norm": 0.5167393088340759, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.769746333360672, + "num_tokens": 841442284.0, + "step": 2927 + }, + { + "epoch": 1.0427426536064115, + "grad_norm": 0.5311731696128845, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7458455264568329, + "num_tokens": 841696709.0, + "step": 2928 + }, + { + "epoch": 1.0430988423864649, + "grad_norm": 0.45325857400894165, + "learning_rate": 1e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7609257102012634, + "num_tokens": 842035846.0, + "step": 2929 + }, + { + "epoch": 1.0434550311665183, + "grad_norm": 0.4677273631095886, + "learning_rate": 1e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7586669772863388, + "num_tokens": 842322496.0, + "step": 2930 + }, + { + "epoch": 1.0438112199465717, + "grad_norm": 0.46257635951042175, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7747493833303452, + "num_tokens": 842659485.0, + "step": 2931 + }, + { + "epoch": 1.044167408726625, + "grad_norm": 0.45071858167648315, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7606348395347595, + "num_tokens": 842920324.0, + "step": 2932 + }, + { + "epoch": 1.0445235975066784, + "grad_norm": 0.5174592137336731, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7679284363985062, + "num_tokens": 843175640.0, + "step": 2933 + }, + { + "epoch": 
1.044879786286732, + "grad_norm": 0.4588494300842285, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7471181899309158, + "num_tokens": 843475501.0, + "step": 2934 + }, + { + "epoch": 1.0452359750667854, + "grad_norm": 0.434702605009079, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7689646035432816, + "num_tokens": 843754132.0, + "step": 2935 + }, + { + "epoch": 1.0455921638468388, + "grad_norm": 0.4779587686061859, + "learning_rate": 1e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7615045011043549, + "num_tokens": 844062781.0, + "step": 2936 + }, + { + "epoch": 1.0459483526268922, + "grad_norm": 0.47126853466033936, + "learning_rate": 1e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7735237032175064, + "num_tokens": 844332990.0, + "step": 2937 + }, + { + "epoch": 1.0463045414069456, + "grad_norm": 0.45336639881134033, + "learning_rate": 1e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7752427309751511, + "num_tokens": 844601984.0, + "step": 2938 + }, + { + "epoch": 1.046660730186999, + "grad_norm": 0.5004746913909912, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7701052278280258, + "num_tokens": 844879746.0, + "step": 2939 + }, + { + "epoch": 1.0470169189670526, + "grad_norm": 0.5103931427001953, + "learning_rate": 1e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7743247449398041, + "num_tokens": 845148644.0, + "step": 2940 + }, + { + "epoch": 1.047373107747106, + "grad_norm": 0.5179495811462402, + "learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7606066465377808, + "num_tokens": 845443972.0, + "step": 2941 + }, + { + "epoch": 1.0477292965271594, + "grad_norm": 0.4540930688381195, + "learning_rate": 1e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7690909653902054, + "num_tokens": 845746914.0, + "step": 2942 + }, + { + "epoch": 1.0480854853072128, + "grad_norm": 0.4511392116546631, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 
0.7501934617757797, + "num_tokens": 846036752.0, + "step": 2943 + }, + { + "epoch": 1.0484416740872662, + "grad_norm": 0.4840991497039795, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7497910559177399, + "num_tokens": 846336417.0, + "step": 2944 + }, + { + "epoch": 1.0487978628673196, + "grad_norm": 0.48882830142974854, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7454710900783539, + "num_tokens": 846598792.0, + "step": 2945 + }, + { + "epoch": 1.0491540516473732, + "grad_norm": 0.4472297430038452, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7712157219648361, + "num_tokens": 846910677.0, + "step": 2946 + }, + { + "epoch": 1.0495102404274266, + "grad_norm": 0.4667232632637024, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.753622755408287, + "num_tokens": 847261734.0, + "step": 2947 + }, + { + "epoch": 1.04986642920748, + "grad_norm": 0.4494629204273224, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7473038136959076, + "num_tokens": 847568298.0, + "step": 2948 + }, + { + "epoch": 1.0502226179875334, + "grad_norm": 0.47186657786369324, + "learning_rate": 1e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.7788310945034027, + "num_tokens": 847846635.0, + "step": 2949 + }, + { + "epoch": 1.0505788067675867, + "grad_norm": 0.4561161398887634, + "learning_rate": 1e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7700608670711517, + "num_tokens": 848136246.0, + "step": 2950 + }, + { + "epoch": 1.0509349955476401, + "grad_norm": 0.4761880040168762, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7513036578893661, + "num_tokens": 848392195.0, + "step": 2951 + }, + { + "epoch": 1.0512911843276938, + "grad_norm": 0.4756295382976532, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7639254331588745, + "num_tokens": 848698532.0, + "step": 2952 + }, + { + "epoch": 1.0516473731077471, + "grad_norm": 
0.4107478857040405, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.76873479783535, + "num_tokens": 849030274.0, + "step": 2953 + }, + { + "epoch": 1.0520035618878005, + "grad_norm": 0.4991289973258972, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7664169669151306, + "num_tokens": 849263790.0, + "step": 2954 + }, + { + "epoch": 1.052359750667854, + "grad_norm": 0.453635573387146, + "learning_rate": 1e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.773367241024971, + "num_tokens": 849578999.0, + "step": 2955 + }, + { + "epoch": 1.0527159394479073, + "grad_norm": 0.510757327079773, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7538682818412781, + "num_tokens": 849881429.0, + "step": 2956 + }, + { + "epoch": 1.053072128227961, + "grad_norm": 0.49099200963974, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7416313737630844, + "num_tokens": 850139227.0, + "step": 2957 + }, + { + "epoch": 1.0534283170080143, + "grad_norm": 0.5102485418319702, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7551239728927612, + "num_tokens": 850406282.0, + "step": 2958 + }, + { + "epoch": 1.0537845057880677, + "grad_norm": 0.5137097239494324, + "learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7650684714317322, + "num_tokens": 850691470.0, + "step": 2959 + }, + { + "epoch": 1.054140694568121, + "grad_norm": 0.5042114853858948, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7735506445169449, + "num_tokens": 850968522.0, + "step": 2960 + }, + { + "epoch": 1.0544968833481745, + "grad_norm": 0.47200828790664673, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7450458854436874, + "num_tokens": 851261194.0, + "step": 2961 + }, + { + "epoch": 1.0548530721282279, + "grad_norm": 0.48224732279777527, + "learning_rate": 1e-06, + "loss": 0.6588, + "mean_token_accuracy": 0.7866308242082596, + "num_tokens": 
851558367.0, + "step": 2962 + }, + { + "epoch": 1.0552092609082815, + "grad_norm": 0.46263620257377625, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7515275031328201, + "num_tokens": 851855746.0, + "step": 2963 + }, + { + "epoch": 1.0555654496883349, + "grad_norm": 0.4948272407054901, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7521659731864929, + "num_tokens": 852178194.0, + "step": 2964 + }, + { + "epoch": 1.0559216384683883, + "grad_norm": 0.47587844729423523, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7571811378002167, + "num_tokens": 852464468.0, + "step": 2965 + }, + { + "epoch": 1.0562778272484417, + "grad_norm": 0.4768812358379364, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7631020545959473, + "num_tokens": 852750498.0, + "step": 2966 + }, + { + "epoch": 1.056634016028495, + "grad_norm": 0.4909980893135071, + "learning_rate": 1e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7730706185102463, + "num_tokens": 853015174.0, + "step": 2967 + }, + { + "epoch": 1.0569902048085484, + "grad_norm": 0.47878119349479675, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7751006484031677, + "num_tokens": 853265934.0, + "step": 2968 + }, + { + "epoch": 1.057346393588602, + "grad_norm": 0.48495417833328247, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7695051282644272, + "num_tokens": 853569613.0, + "step": 2969 + }, + { + "epoch": 1.0577025823686554, + "grad_norm": 0.4764358699321747, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7621534168720245, + "num_tokens": 853851433.0, + "step": 2970 + }, + { + "epoch": 1.0580587711487088, + "grad_norm": 0.49077126383781433, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7609757781028748, + "num_tokens": 854169897.0, + "step": 2971 + }, + { + "epoch": 1.0584149599287622, + "grad_norm": 0.4685116708278656, + "learning_rate": 
1e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7688997834920883, + "num_tokens": 854432395.0, + "step": 2972 + }, + { + "epoch": 1.0587711487088156, + "grad_norm": 0.4640658497810364, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7676800638437271, + "num_tokens": 854743558.0, + "step": 2973 + }, + { + "epoch": 1.059127337488869, + "grad_norm": 0.4899391829967499, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7622034996747971, + "num_tokens": 855017487.0, + "step": 2974 + }, + { + "epoch": 1.0594835262689226, + "grad_norm": 0.49175822734832764, + "learning_rate": 1e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7520103305578232, + "num_tokens": 855317285.0, + "step": 2975 + }, + { + "epoch": 1.059839715048976, + "grad_norm": 0.4693028926849365, + "learning_rate": 1e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7755252420902252, + "num_tokens": 855596212.0, + "step": 2976 + }, + { + "epoch": 1.0601959038290294, + "grad_norm": 0.49413251876831055, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7580691426992416, + "num_tokens": 855870062.0, + "step": 2977 + }, + { + "epoch": 1.0605520926090828, + "grad_norm": 0.45053476095199585, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7646878659725189, + "num_tokens": 856168019.0, + "step": 2978 + }, + { + "epoch": 1.0609082813891362, + "grad_norm": 0.47882959246635437, + "learning_rate": 1e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7781332433223724, + "num_tokens": 856443401.0, + "step": 2979 + }, + { + "epoch": 1.0612644701691896, + "grad_norm": 0.46041035652160645, + "learning_rate": 1e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7626692950725555, + "num_tokens": 856734963.0, + "step": 2980 + }, + { + "epoch": 1.0616206589492432, + "grad_norm": 0.5059411525726318, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7683224529027939, + "num_tokens": 857040584.0, + "step": 2981 + }, + { + 
"epoch": 1.0619768477292966, + "grad_norm": 0.46320080757141113, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7554801851511002, + "num_tokens": 857340336.0, + "step": 2982 + }, + { + "epoch": 1.06233303650935, + "grad_norm": 0.44846153259277344, + "learning_rate": 1e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.780876561999321, + "num_tokens": 857621857.0, + "step": 2983 + }, + { + "epoch": 1.0626892252894033, + "grad_norm": 0.5221774578094482, + "learning_rate": 1e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7593562602996826, + "num_tokens": 857898563.0, + "step": 2984 + }, + { + "epoch": 1.0630454140694567, + "grad_norm": 0.5181519985198975, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.772742822766304, + "num_tokens": 858176992.0, + "step": 2985 + }, + { + "epoch": 1.0634016028495101, + "grad_norm": 0.48323875665664673, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7541397958993912, + "num_tokens": 858436510.0, + "step": 2986 + }, + { + "epoch": 1.0637577916295637, + "grad_norm": 0.4996882379055023, + "learning_rate": 1e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7467018812894821, + "num_tokens": 858721709.0, + "step": 2987 + }, + { + "epoch": 1.0641139804096171, + "grad_norm": 0.4853041172027588, + "learning_rate": 1e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7557435780763626, + "num_tokens": 859008655.0, + "step": 2988 + }, + { + "epoch": 1.0644701691896705, + "grad_norm": 0.43354669213294983, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7512247264385223, + "num_tokens": 859296627.0, + "step": 2989 + }, + { + "epoch": 1.064826357969724, + "grad_norm": 0.5035287141799927, + "learning_rate": 1e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7619329988956451, + "num_tokens": 859611843.0, + "step": 2990 + }, + { + "epoch": 1.0651825467497773, + "grad_norm": 0.49025505781173706, + "learning_rate": 1e-06, + "loss": 0.701, + 
"mean_token_accuracy": 0.7731278240680695, + "num_tokens": 859866823.0, + "step": 2991 + }, + { + "epoch": 1.065538735529831, + "grad_norm": 0.44799643754959106, + "learning_rate": 1e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7802581042051315, + "num_tokens": 860173131.0, + "step": 2992 + }, + { + "epoch": 1.0658949243098843, + "grad_norm": 0.48015350103378296, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7648712396621704, + "num_tokens": 860475063.0, + "step": 2993 + }, + { + "epoch": 1.0662511130899377, + "grad_norm": 0.4893440902233124, + "learning_rate": 1e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7678961157798767, + "num_tokens": 860760603.0, + "step": 2994 + }, + { + "epoch": 1.066607301869991, + "grad_norm": 0.5033582448959351, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7500067800283432, + "num_tokens": 861005805.0, + "step": 2995 + }, + { + "epoch": 1.0669634906500445, + "grad_norm": 0.5016795992851257, + "learning_rate": 1e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7701360285282135, + "num_tokens": 861274915.0, + "step": 2996 + }, + { + "epoch": 1.0673196794300979, + "grad_norm": 0.4825364351272583, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7628141641616821, + "num_tokens": 861569180.0, + "step": 2997 + }, + { + "epoch": 1.0676758682101515, + "grad_norm": 0.46605437994003296, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7682561576366425, + "num_tokens": 861854060.0, + "step": 2998 + }, + { + "epoch": 1.0680320569902049, + "grad_norm": 0.5155843496322632, + "learning_rate": 1e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7744400501251221, + "num_tokens": 862125305.0, + "step": 2999 + }, + { + "epoch": 1.0683882457702583, + "grad_norm": 0.4790986478328705, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7655293345451355, + "num_tokens": 862407623.0, + "step": 3000 + }, + { + "epoch": 1.0687444345503117, 
+ "grad_norm": 0.4861406683921814, + "learning_rate": 1e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7690936625003815, + "num_tokens": 862713058.0, + "step": 3001 + }, + { + "epoch": 1.069100623330365, + "grad_norm": 0.5112716555595398, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7702015340328217, + "num_tokens": 862979395.0, + "step": 3002 + }, + { + "epoch": 1.0694568121104184, + "grad_norm": 0.46740537881851196, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7616097629070282, + "num_tokens": 863265895.0, + "step": 3003 + }, + { + "epoch": 1.069813000890472, + "grad_norm": 0.4690558910369873, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7669049501419067, + "num_tokens": 863557197.0, + "step": 3004 + }, + { + "epoch": 1.0701691896705254, + "grad_norm": 0.5147724151611328, + "learning_rate": 1e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7733833342790604, + "num_tokens": 863851550.0, + "step": 3005 + }, + { + "epoch": 1.0705253784505788, + "grad_norm": 0.4549448788166046, + "learning_rate": 1e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7527302801609039, + "num_tokens": 864140050.0, + "step": 3006 + }, + { + "epoch": 1.0708815672306322, + "grad_norm": 0.47063493728637695, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7520893514156342, + "num_tokens": 864410663.0, + "step": 3007 + }, + { + "epoch": 1.0712377560106856, + "grad_norm": 0.4597645103931427, + "learning_rate": 1e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7692674994468689, + "num_tokens": 864706137.0, + "step": 3008 + }, + { + "epoch": 1.071593944790739, + "grad_norm": 0.4807433784008026, + "learning_rate": 1e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7590351998806, + "num_tokens": 864981008.0, + "step": 3009 + }, + { + "epoch": 1.0719501335707926, + "grad_norm": 0.48714691400527954, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7599306404590607, + 
"num_tokens": 865269399.0, + "step": 3010 + }, + { + "epoch": 1.072306322350846, + "grad_norm": 0.4226738512516022, + "learning_rate": 1e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7890309244394302, + "num_tokens": 865576232.0, + "step": 3011 + }, + { + "epoch": 1.0726625111308994, + "grad_norm": 0.47932735085487366, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7743567377328873, + "num_tokens": 865838615.0, + "step": 3012 + }, + { + "epoch": 1.0730186999109528, + "grad_norm": 0.49076101183891296, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7621625065803528, + "num_tokens": 866126349.0, + "step": 3013 + }, + { + "epoch": 1.0733748886910062, + "grad_norm": 0.4230869710445404, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7751349359750748, + "num_tokens": 866427072.0, + "step": 3014 + }, + { + "epoch": 1.0737310774710596, + "grad_norm": 0.5362598896026611, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7712421119213104, + "num_tokens": 866709327.0, + "step": 3015 + }, + { + "epoch": 1.0740872662511132, + "grad_norm": 0.45887047052383423, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7617221921682358, + "num_tokens": 867006295.0, + "step": 3016 + }, + { + "epoch": 1.0744434550311666, + "grad_norm": 0.44747263193130493, + "learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7677261382341385, + "num_tokens": 867330676.0, + "step": 3017 + }, + { + "epoch": 1.07479964381122, + "grad_norm": 0.45674070715904236, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7619915455579758, + "num_tokens": 867613223.0, + "step": 3018 + }, + { + "epoch": 1.0751558325912733, + "grad_norm": 0.43836715817451477, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7638670653104782, + "num_tokens": 867920627.0, + "step": 3019 + }, + { + "epoch": 1.0755120213713267, + "grad_norm": 0.5138468742370605, + 
"learning_rate": 1e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7756931632757187, + "num_tokens": 868197996.0, + "step": 3020 + }, + { + "epoch": 1.0758682101513801, + "grad_norm": 0.42602643370628357, + "learning_rate": 1e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7700169682502747, + "num_tokens": 868506123.0, + "step": 3021 + }, + { + "epoch": 1.0762243989314337, + "grad_norm": 0.4452275335788727, + "learning_rate": 1e-06, + "loss": 0.701, + "mean_token_accuracy": 0.7772497981786728, + "num_tokens": 868833546.0, + "step": 3022 + }, + { + "epoch": 1.0765805877114871, + "grad_norm": 0.44951239228248596, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7675650417804718, + "num_tokens": 869144558.0, + "step": 3023 + }, + { + "epoch": 1.0769367764915405, + "grad_norm": 0.47085076570510864, + "learning_rate": 1e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7617430090904236, + "num_tokens": 869476904.0, + "step": 3024 + }, + { + "epoch": 1.077292965271594, + "grad_norm": 0.46154525876045227, + "learning_rate": 1e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7783127576112747, + "num_tokens": 869778351.0, + "step": 3025 + }, + { + "epoch": 1.0776491540516473, + "grad_norm": 0.4851115345954895, + "learning_rate": 1e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7507282048463821, + "num_tokens": 870099244.0, + "step": 3026 + }, + { + "epoch": 1.078005342831701, + "grad_norm": 0.4672606289386749, + "learning_rate": 1e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7779197841882706, + "num_tokens": 870381019.0, + "step": 3027 + }, + { + "epoch": 1.0783615316117543, + "grad_norm": 0.5001000165939331, + "learning_rate": 1e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7592483460903168, + "num_tokens": 870665706.0, + "step": 3028 + }, + { + "epoch": 1.0787177203918077, + "grad_norm": 0.44871270656585693, + "learning_rate": 1e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.77508544921875, + "num_tokens": 870959821.0, + "step": 
3029 + }, + { + "epoch": 1.079073909171861, + "grad_norm": 0.4840683937072754, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.763529509305954, + "num_tokens": 871222259.0, + "step": 3030 + }, + { + "epoch": 1.0794300979519145, + "grad_norm": 0.5139626860618591, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7633890807628632, + "num_tokens": 871478784.0, + "step": 3031 + }, + { + "epoch": 1.0797862867319679, + "grad_norm": 0.4867606461048126, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7402324974536896, + "num_tokens": 871757209.0, + "step": 3032 + }, + { + "epoch": 1.0801424755120215, + "grad_norm": 0.5303176045417786, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7739688456058502, + "num_tokens": 872048517.0, + "step": 3033 + }, + { + "epoch": 1.0804986642920749, + "grad_norm": 0.5006769299507141, + "learning_rate": 1e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7783440053462982, + "num_tokens": 872324883.0, + "step": 3034 + }, + { + "epoch": 1.0808548530721283, + "grad_norm": 0.4677661955356598, + "learning_rate": 1e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7814194709062576, + "num_tokens": 872622872.0, + "step": 3035 + }, + { + "epoch": 1.0812110418521816, + "grad_norm": 0.4809877574443817, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7588445544242859, + "num_tokens": 872892473.0, + "step": 3036 + }, + { + "epoch": 1.081567230632235, + "grad_norm": 0.4741208255290985, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.762847363948822, + "num_tokens": 873191435.0, + "step": 3037 + }, + { + "epoch": 1.0819234194122884, + "grad_norm": 0.5005366206169128, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7720138430595398, + "num_tokens": 873487466.0, + "step": 3038 + }, + { + "epoch": 1.082279608192342, + "grad_norm": 0.5104890465736389, + "learning_rate": 1e-06, + "loss": 0.7519, + 
"mean_token_accuracy": 0.7659303396940231, + "num_tokens": 873755795.0, + "step": 3039 + }, + { + "epoch": 1.0826357969723954, + "grad_norm": 0.4449548125267029, + "learning_rate": 1e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7609515637159348, + "num_tokens": 874069193.0, + "step": 3040 + }, + { + "epoch": 1.0829919857524488, + "grad_norm": 0.4812804162502289, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7619089037179947, + "num_tokens": 874358525.0, + "step": 3041 + }, + { + "epoch": 1.0833481745325022, + "grad_norm": 0.5020546913146973, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7706896066665649, + "num_tokens": 874641517.0, + "step": 3042 + }, + { + "epoch": 1.0837043633125556, + "grad_norm": 0.4634293019771576, + "learning_rate": 1e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7798388600349426, + "num_tokens": 874931232.0, + "step": 3043 + }, + { + "epoch": 1.084060552092609, + "grad_norm": 0.4555800259113312, + "learning_rate": 1e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.776510164141655, + "num_tokens": 875216815.0, + "step": 3044 + }, + { + "epoch": 1.0844167408726626, + "grad_norm": 0.4662371277809143, + "learning_rate": 1e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7583825141191483, + "num_tokens": 875504700.0, + "step": 3045 + }, + { + "epoch": 1.084772929652716, + "grad_norm": 0.47289109230041504, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7557858824729919, + "num_tokens": 875807511.0, + "step": 3046 + }, + { + "epoch": 1.0851291184327694, + "grad_norm": 0.47925955057144165, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7587823867797852, + "num_tokens": 876100465.0, + "step": 3047 + }, + { + "epoch": 1.0854853072128228, + "grad_norm": 0.5209761261940002, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7649736255407333, + "num_tokens": 876389901.0, + "step": 3048 + }, + { + "epoch": 1.0858414959928762, + 
"grad_norm": 0.4808775782585144, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7552674859762192, + "num_tokens": 876677039.0, + "step": 3049 + }, + { + "epoch": 1.0861976847729296, + "grad_norm": 0.4064384400844574, + "learning_rate": 1e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7608519792556763, + "num_tokens": 876997934.0, + "step": 3050 + }, + { + "epoch": 1.0865538735529832, + "grad_norm": 0.4629940986633301, + "learning_rate": 1e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7778143137693405, + "num_tokens": 877294193.0, + "step": 3051 + }, + { + "epoch": 1.0869100623330366, + "grad_norm": 0.5196875333786011, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7390934377908707, + "num_tokens": 877547827.0, + "step": 3052 + }, + { + "epoch": 1.08726625111309, + "grad_norm": 0.4676130414009094, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7527557164430618, + "num_tokens": 877814008.0, + "step": 3053 + }, + { + "epoch": 1.0876224398931433, + "grad_norm": 0.5515897870063782, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7549661099910736, + "num_tokens": 878079158.0, + "step": 3054 + }, + { + "epoch": 1.0879786286731967, + "grad_norm": 0.473436564207077, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7558727115392685, + "num_tokens": 878367387.0, + "step": 3055 + }, + { + "epoch": 1.0883348174532501, + "grad_norm": 0.5174038410186768, + "learning_rate": 1e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.775843545794487, + "num_tokens": 878645667.0, + "step": 3056 + }, + { + "epoch": 1.0886910062333037, + "grad_norm": 0.49840134382247925, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7574822157621384, + "num_tokens": 878936660.0, + "step": 3057 + }, + { + "epoch": 1.0890471950133571, + "grad_norm": 0.5133033990859985, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7598986178636551, + 
"num_tokens": 879187061.0, + "step": 3058 + }, + { + "epoch": 1.0894033837934105, + "grad_norm": 0.4224367141723633, + "learning_rate": 1e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.7813328504562378, + "num_tokens": 879480168.0, + "step": 3059 + }, + { + "epoch": 1.089759572573464, + "grad_norm": 0.49234867095947266, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7557874172925949, + "num_tokens": 879769778.0, + "step": 3060 + }, + { + "epoch": 1.0901157613535173, + "grad_norm": 0.5415819883346558, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7712894678115845, + "num_tokens": 880058848.0, + "step": 3061 + }, + { + "epoch": 1.090471950133571, + "grad_norm": 0.4734170436859131, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7402882128953934, + "num_tokens": 880353421.0, + "step": 3062 + }, + { + "epoch": 1.0908281389136243, + "grad_norm": 0.4665142297744751, + "learning_rate": 1e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.7837092727422714, + "num_tokens": 880660376.0, + "step": 3063 + }, + { + "epoch": 1.0911843276936777, + "grad_norm": 0.44831401109695435, + "learning_rate": 1e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.769917756319046, + "num_tokens": 880976423.0, + "step": 3064 + }, + { + "epoch": 1.091540516473731, + "grad_norm": 0.4602123200893402, + "learning_rate": 1e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.7528553456068039, + "num_tokens": 881262840.0, + "step": 3065 + }, + { + "epoch": 1.0918967052537845, + "grad_norm": 0.4747543931007385, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7659401595592499, + "num_tokens": 881553243.0, + "step": 3066 + }, + { + "epoch": 1.0922528940338379, + "grad_norm": 0.5384807586669922, + "learning_rate": 1e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7647627294063568, + "num_tokens": 881872292.0, + "step": 3067 + }, + { + "epoch": 1.0926090828138915, + "grad_norm": 0.42628124356269836, + 
"learning_rate": 1e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7723193764686584, + "num_tokens": 882172450.0, + "step": 3068 + }, + { + "epoch": 1.0929652715939449, + "grad_norm": 0.5014832615852356, + "learning_rate": 1e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.7742990404367447, + "num_tokens": 882426986.0, + "step": 3069 + }, + { + "epoch": 1.0933214603739982, + "grad_norm": 0.43307211995124817, + "learning_rate": 1e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7822532802820206, + "num_tokens": 882748437.0, + "step": 3070 + }, + { + "epoch": 1.0936776491540516, + "grad_norm": 0.4906090795993805, + "learning_rate": 1e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.758651927113533, + "num_tokens": 883062835.0, + "step": 3071 + }, + { + "epoch": 1.094033837934105, + "grad_norm": 0.5738362073898315, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7727567404508591, + "num_tokens": 883314996.0, + "step": 3072 + }, + { + "epoch": 1.0943900267141584, + "grad_norm": 0.4407784938812256, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7600050866603851, + "num_tokens": 883627222.0, + "step": 3073 + }, + { + "epoch": 1.094746215494212, + "grad_norm": 0.47086918354034424, + "learning_rate": 1e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7710106372833252, + "num_tokens": 883884814.0, + "step": 3074 + }, + { + "epoch": 1.0951024042742654, + "grad_norm": 0.49703535437583923, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7633068263530731, + "num_tokens": 884172161.0, + "step": 3075 + }, + { + "epoch": 1.0954585930543188, + "grad_norm": 0.506182074546814, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7609056234359741, + "num_tokens": 884435994.0, + "step": 3076 + }, + { + "epoch": 1.0958147818343722, + "grad_norm": 0.4547540843486786, + "learning_rate": 1e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7788371741771698, + "num_tokens": 884696982.0, + "step": 3077 
+ }, + { + "epoch": 1.0961709706144256, + "grad_norm": 0.4715493321418762, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7584625482559204, + "num_tokens": 884975348.0, + "step": 3078 + }, + { + "epoch": 1.096527159394479, + "grad_norm": 0.5377711057662964, + "learning_rate": 1e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7774156779050827, + "num_tokens": 885257342.0, + "step": 3079 + }, + { + "epoch": 1.0968833481745326, + "grad_norm": 0.45441073179244995, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7796885371208191, + "num_tokens": 885536189.0, + "step": 3080 + }, + { + "epoch": 1.097239536954586, + "grad_norm": 0.4843765199184418, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7547367066144943, + "num_tokens": 885833087.0, + "step": 3081 + }, + { + "epoch": 1.0975957257346394, + "grad_norm": 0.45273372530937195, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7725218832492828, + "num_tokens": 886132208.0, + "step": 3082 + }, + { + "epoch": 1.0979519145146928, + "grad_norm": 0.46899673342704773, + "learning_rate": 1e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.7686246782541275, + "num_tokens": 886412110.0, + "step": 3083 + }, + { + "epoch": 1.0983081032947462, + "grad_norm": 0.4944090247154236, + "learning_rate": 1e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7588013708591461, + "num_tokens": 886704143.0, + "step": 3084 + }, + { + "epoch": 1.0986642920747995, + "grad_norm": 0.48508721590042114, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7398175001144409, + "num_tokens": 887004474.0, + "step": 3085 + }, + { + "epoch": 1.0990204808548532, + "grad_norm": 0.46011170744895935, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.760183647274971, + "num_tokens": 887286342.0, + "step": 3086 + }, + { + "epoch": 1.0993766696349065, + "grad_norm": 0.47859352827072144, + "learning_rate": 1e-06, + "loss": 0.7149, + 
"mean_token_accuracy": 0.7694175094366074, + "num_tokens": 887558640.0, + "step": 3087 + }, + { + "epoch": 1.09973285841496, + "grad_norm": 0.4755301773548126, + "learning_rate": 1e-06, + "loss": 0.6756, + "mean_token_accuracy": 0.7859136909246445, + "num_tokens": 887891069.0, + "step": 3088 + }, + { + "epoch": 1.1000890471950133, + "grad_norm": 0.43953338265419006, + "learning_rate": 1e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7815170735120773, + "num_tokens": 888197501.0, + "step": 3089 + }, + { + "epoch": 1.1004452359750667, + "grad_norm": 0.5052156448364258, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7612350434064865, + "num_tokens": 888418106.0, + "step": 3090 + }, + { + "epoch": 1.10080142475512, + "grad_norm": 0.48849907517433167, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7501756101846695, + "num_tokens": 888727402.0, + "step": 3091 + }, + { + "epoch": 1.1011576135351737, + "grad_norm": 0.46829476952552795, + "learning_rate": 1e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.7733038514852524, + "num_tokens": 889004486.0, + "step": 3092 + }, + { + "epoch": 1.101513802315227, + "grad_norm": 0.440317302942276, + "learning_rate": 1e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7780640572309494, + "num_tokens": 889304516.0, + "step": 3093 + }, + { + "epoch": 1.1018699910952805, + "grad_norm": 0.48926812410354614, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7674062997102737, + "num_tokens": 889602950.0, + "step": 3094 + }, + { + "epoch": 1.102226179875334, + "grad_norm": 0.5011337399482727, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7695414423942566, + "num_tokens": 889916039.0, + "step": 3095 + }, + { + "epoch": 1.1025823686553873, + "grad_norm": 0.45732226967811584, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7714496105909348, + "num_tokens": 890199008.0, + "step": 3096 + }, + { + "epoch": 1.102938557435441, + 
"grad_norm": 0.47862741351127625, + "learning_rate": 1e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7573269903659821, + "num_tokens": 890506399.0, + "step": 3097 + }, + { + "epoch": 1.1032947462154943, + "grad_norm": 0.5160518288612366, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7702709138393402, + "num_tokens": 890806006.0, + "step": 3098 + }, + { + "epoch": 1.1036509349955477, + "grad_norm": 0.5047280192375183, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7739361077547073, + "num_tokens": 891058016.0, + "step": 3099 + }, + { + "epoch": 1.104007123775601, + "grad_norm": 0.4509919583797455, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7648809403181076, + "num_tokens": 891374408.0, + "step": 3100 + }, + { + "epoch": 1.1043633125556545, + "grad_norm": 0.5039306879043579, + "learning_rate": 1e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.7608290761709213, + "num_tokens": 891632762.0, + "step": 3101 + }, + { + "epoch": 1.1047195013357078, + "grad_norm": 0.5104768872261047, + "learning_rate": 1e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.77702896296978, + "num_tokens": 891909928.0, + "step": 3102 + }, + { + "epoch": 1.1050756901157615, + "grad_norm": 0.495701402425766, + "learning_rate": 1e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7642497420310974, + "num_tokens": 892198609.0, + "step": 3103 + }, + { + "epoch": 1.1054318788958148, + "grad_norm": 0.4920479953289032, + "learning_rate": 1e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7771295607089996, + "num_tokens": 892490949.0, + "step": 3104 + }, + { + "epoch": 1.1057880676758682, + "grad_norm": 0.45178818702697754, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7605077922344208, + "num_tokens": 892801236.0, + "step": 3105 + }, + { + "epoch": 1.1061442564559216, + "grad_norm": 0.47008252143859863, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.754898339509964, + 
"num_tokens": 893091804.0, + "step": 3106 + }, + { + "epoch": 1.106500445235975, + "grad_norm": 0.4738427698612213, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7612613439559937, + "num_tokens": 893377319.0, + "step": 3107 + }, + { + "epoch": 1.1068566340160284, + "grad_norm": 0.5084293484687805, + "learning_rate": 1e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7670498639345169, + "num_tokens": 893662127.0, + "step": 3108 + }, + { + "epoch": 1.107212822796082, + "grad_norm": 0.47637856006622314, + "learning_rate": 1e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7620210200548172, + "num_tokens": 893929244.0, + "step": 3109 + }, + { + "epoch": 1.1075690115761354, + "grad_norm": 0.42683371901512146, + "learning_rate": 1e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7702426463365555, + "num_tokens": 894227078.0, + "step": 3110 + }, + { + "epoch": 1.1079252003561888, + "grad_norm": 0.48906639218330383, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7395511418581009, + "num_tokens": 894550230.0, + "step": 3111 + }, + { + "epoch": 1.1082813891362422, + "grad_norm": 0.4726780652999878, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7606839239597321, + "num_tokens": 894860199.0, + "step": 3112 + }, + { + "epoch": 1.1086375779162956, + "grad_norm": 0.46174702048301697, + "learning_rate": 1e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.769465371966362, + "num_tokens": 895149318.0, + "step": 3113 + }, + { + "epoch": 1.108993766696349, + "grad_norm": 0.4425880014896393, + "learning_rate": 1e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7732369303703308, + "num_tokens": 895444993.0, + "step": 3114 + }, + { + "epoch": 1.1093499554764026, + "grad_norm": 0.4833201766014099, + "learning_rate": 1e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7630178481340408, + "num_tokens": 895725225.0, + "step": 3115 + }, + { + "epoch": 1.109706144256456, + "grad_norm": 0.44762808084487915, + 
"learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.763847753405571, + "num_tokens": 896026061.0, + "step": 3116 + }, + { + "epoch": 1.1100623330365094, + "grad_norm": 0.4747490882873535, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7749955654144287, + "num_tokens": 896312397.0, + "step": 3117 + }, + { + "epoch": 1.1104185218165628, + "grad_norm": 0.5082911849021912, + "learning_rate": 1e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7632592767477036, + "num_tokens": 896563467.0, + "step": 3118 + }, + { + "epoch": 1.1107747105966161, + "grad_norm": 0.4510814845561981, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7751310616731644, + "num_tokens": 896868056.0, + "step": 3119 + }, + { + "epoch": 1.1111308993766695, + "grad_norm": 0.4820919334888458, + "learning_rate": 1e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7658707648515701, + "num_tokens": 897163273.0, + "step": 3120 + }, + { + "epoch": 1.1114870881567231, + "grad_norm": 0.43996956944465637, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7762825340032578, + "num_tokens": 897494357.0, + "step": 3121 + }, + { + "epoch": 1.1118432769367765, + "grad_norm": 0.5065517425537109, + "learning_rate": 1e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7802840620279312, + "num_tokens": 897767133.0, + "step": 3122 + }, + { + "epoch": 1.11219946571683, + "grad_norm": 0.49759596586227417, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7674534916877747, + "num_tokens": 898054588.0, + "step": 3123 + }, + { + "epoch": 1.1125556544968833, + "grad_norm": 0.537344217300415, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7627329528331757, + "num_tokens": 898334318.0, + "step": 3124 + }, + { + "epoch": 1.1129118432769367, + "grad_norm": 0.4708343744277954, + "learning_rate": 1e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7802080661058426, + "num_tokens": 898642117.0, + "step": 3125 
+ }, + { + "epoch": 1.11326803205699, + "grad_norm": 0.45680245757102966, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7651673704385757, + "num_tokens": 898950772.0, + "step": 3126 + }, + { + "epoch": 1.1136242208370437, + "grad_norm": 0.4520605504512787, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7782044261693954, + "num_tokens": 899227242.0, + "step": 3127 + }, + { + "epoch": 1.113980409617097, + "grad_norm": 0.5057541131973267, + "learning_rate": 1e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.7839710265398026, + "num_tokens": 899526550.0, + "step": 3128 + }, + { + "epoch": 1.1143365983971505, + "grad_norm": 0.5176829695701599, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7461500912904739, + "num_tokens": 899779190.0, + "step": 3129 + }, + { + "epoch": 1.1146927871772039, + "grad_norm": 0.4897320866584778, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7687669694423676, + "num_tokens": 900045244.0, + "step": 3130 + }, + { + "epoch": 1.1150489759572573, + "grad_norm": 0.49156591296195984, + "learning_rate": 1e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7711964994668961, + "num_tokens": 900325332.0, + "step": 3131 + }, + { + "epoch": 1.1154051647373109, + "grad_norm": 0.4780857264995575, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7647912055253983, + "num_tokens": 900615513.0, + "step": 3132 + }, + { + "epoch": 1.1157613535173643, + "grad_norm": 0.44790422916412354, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7661565989255905, + "num_tokens": 900944154.0, + "step": 3133 + }, + { + "epoch": 1.1161175422974177, + "grad_norm": 0.5304303169250488, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7508900165557861, + "num_tokens": 901197100.0, + "step": 3134 + }, + { + "epoch": 1.116473731077471, + "grad_norm": 0.45095697045326233, + "learning_rate": 1e-06, + "loss": 0.6723, + 
"mean_token_accuracy": 0.7797900438308716, + "num_tokens": 901488401.0, + "step": 3135 + }, + { + "epoch": 1.1168299198575244, + "grad_norm": 0.46532198786735535, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7584024667739868, + "num_tokens": 901790043.0, + "step": 3136 + }, + { + "epoch": 1.1171861086375778, + "grad_norm": 0.43772879242897034, + "learning_rate": 1e-06, + "loss": 0.709, + "mean_token_accuracy": 0.777539074420929, + "num_tokens": 902056897.0, + "step": 3137 + }, + { + "epoch": 1.1175422974176314, + "grad_norm": 0.45673471689224243, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7731942385435104, + "num_tokens": 902356483.0, + "step": 3138 + }, + { + "epoch": 1.1178984861976848, + "grad_norm": 0.46782198548316956, + "learning_rate": 1e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.764721468091011, + "num_tokens": 902651435.0, + "step": 3139 + }, + { + "epoch": 1.1182546749777382, + "grad_norm": 0.4665337800979614, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7619379013776779, + "num_tokens": 902943031.0, + "step": 3140 + }, + { + "epoch": 1.1186108637577916, + "grad_norm": 0.44048255681991577, + "learning_rate": 1e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7743213474750519, + "num_tokens": 903253250.0, + "step": 3141 + }, + { + "epoch": 1.118967052537845, + "grad_norm": 0.4980960488319397, + "learning_rate": 1e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7428091317415237, + "num_tokens": 903521011.0, + "step": 3142 + }, + { + "epoch": 1.1193232413178984, + "grad_norm": 0.4407278001308441, + "learning_rate": 1e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.7798610925674438, + "num_tokens": 903831690.0, + "step": 3143 + }, + { + "epoch": 1.119679430097952, + "grad_norm": 0.4349294900894165, + "learning_rate": 1e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7665667682886124, + "num_tokens": 904151639.0, + "step": 3144 + }, + { + "epoch": 1.1200356188780054, 
+ "grad_norm": 0.528833270072937, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7661204040050507, + "num_tokens": 904405534.0, + "step": 3145 + }, + { + "epoch": 1.1203918076580588, + "grad_norm": 0.5028890371322632, + "learning_rate": 1e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7816711366176605, + "num_tokens": 904703102.0, + "step": 3146 + }, + { + "epoch": 1.1207479964381122, + "grad_norm": 0.4834579825401306, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.76694755256176, + "num_tokens": 904990167.0, + "step": 3147 + }, + { + "epoch": 1.1211041852181656, + "grad_norm": 0.4627705216407776, + "learning_rate": 1e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7669654041528702, + "num_tokens": 905262145.0, + "step": 3148 + }, + { + "epoch": 1.121460373998219, + "grad_norm": 0.4597424864768982, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7675302624702454, + "num_tokens": 905517431.0, + "step": 3149 + }, + { + "epoch": 1.1218165627782726, + "grad_norm": 0.46387889981269836, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7641491889953613, + "num_tokens": 905795082.0, + "step": 3150 + }, + { + "epoch": 1.122172751558326, + "grad_norm": 0.4623622000217438, + "learning_rate": 1e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7793403416872025, + "num_tokens": 906060469.0, + "step": 3151 + }, + { + "epoch": 1.1225289403383794, + "grad_norm": 0.47346413135528564, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7688294798135757, + "num_tokens": 906331430.0, + "step": 3152 + }, + { + "epoch": 1.1228851291184327, + "grad_norm": 0.47725892066955566, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.776993602514267, + "num_tokens": 906631023.0, + "step": 3153 + }, + { + "epoch": 1.1232413178984861, + "grad_norm": 0.48205968737602234, + "learning_rate": 1e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7515400350093842, + 
"num_tokens": 906942197.0, + "step": 3154 + }, + { + "epoch": 1.1235975066785395, + "grad_norm": 0.4434294104576111, + "learning_rate": 1e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.7840927988290787, + "num_tokens": 907222550.0, + "step": 3155 + }, + { + "epoch": 1.1239536954585931, + "grad_norm": 0.46914049983024597, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7641686350107193, + "num_tokens": 907489079.0, + "step": 3156 + }, + { + "epoch": 1.1243098842386465, + "grad_norm": 0.45881199836730957, + "learning_rate": 1e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7625598460435867, + "num_tokens": 907763043.0, + "step": 3157 + }, + { + "epoch": 1.1246660730187, + "grad_norm": 0.46654438972473145, + "learning_rate": 1e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7645247727632523, + "num_tokens": 908067541.0, + "step": 3158 + }, + { + "epoch": 1.1250222617987533, + "grad_norm": 0.44877713918685913, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7630733400583267, + "num_tokens": 908352031.0, + "step": 3159 + }, + { + "epoch": 1.1253784505788067, + "grad_norm": 0.47346624732017517, + "learning_rate": 1e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7722821086645126, + "num_tokens": 908669554.0, + "step": 3160 + }, + { + "epoch": 1.12573463935886, + "grad_norm": 0.46918150782585144, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7577698528766632, + "num_tokens": 908990412.0, + "step": 3161 + }, + { + "epoch": 1.1260908281389137, + "grad_norm": 0.48292842507362366, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7640081942081451, + "num_tokens": 909261835.0, + "step": 3162 + }, + { + "epoch": 1.126447016918967, + "grad_norm": 0.45544153451919556, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7576791793107986, + "num_tokens": 909562105.0, + "step": 3163 + }, + { + "epoch": 1.1268032056990205, + "grad_norm": 0.49718841910362244, + 
"learning_rate": 1e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7629114389419556, + "num_tokens": 909835385.0, + "step": 3164 + }, + { + "epoch": 1.1271593944790739, + "grad_norm": 0.4227934181690216, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7535708695650101, + "num_tokens": 910123538.0, + "step": 3165 + }, + { + "epoch": 1.1275155832591273, + "grad_norm": 0.5058624148368835, + "learning_rate": 1e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7542123347520828, + "num_tokens": 910382136.0, + "step": 3166 + }, + { + "epoch": 1.1278717720391809, + "grad_norm": 0.4321843087673187, + "learning_rate": 1e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7861779779195786, + "num_tokens": 910697902.0, + "step": 3167 + }, + { + "epoch": 1.1282279608192343, + "grad_norm": 0.45333054661750793, + "learning_rate": 1e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7632837742567062, + "num_tokens": 911003480.0, + "step": 3168 + }, + { + "epoch": 1.1285841495992877, + "grad_norm": 0.47910287976264954, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7560255974531174, + "num_tokens": 911292797.0, + "step": 3169 + }, + { + "epoch": 1.128940338379341, + "grad_norm": 0.5102831125259399, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7645800858736038, + "num_tokens": 911599484.0, + "step": 3170 + }, + { + "epoch": 1.1292965271593944, + "grad_norm": 0.49650630354881287, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7738558650016785, + "num_tokens": 911900889.0, + "step": 3171 + }, + { + "epoch": 1.1296527159394478, + "grad_norm": 0.44509121775627136, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7710760235786438, + "num_tokens": 912206652.0, + "step": 3172 + }, + { + "epoch": 1.1300089047195012, + "grad_norm": 0.47961464524269104, + "learning_rate": 1e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.7871579825878143, + "num_tokens": 912509235.0, + "step": 
3173 + }, + { + "epoch": 1.1303650934995548, + "grad_norm": 0.532471239566803, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.766332820057869, + "num_tokens": 912748340.0, + "step": 3174 + }, + { + "epoch": 1.1307212822796082, + "grad_norm": 0.4978734850883484, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7710257172584534, + "num_tokens": 913016467.0, + "step": 3175 + }, + { + "epoch": 1.1310774710596616, + "grad_norm": 0.433918297290802, + "learning_rate": 1e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7752398699522018, + "num_tokens": 913326176.0, + "step": 3176 + }, + { + "epoch": 1.131433659839715, + "grad_norm": 0.5286362767219543, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.767452284693718, + "num_tokens": 913621909.0, + "step": 3177 + }, + { + "epoch": 1.1317898486197684, + "grad_norm": 0.49276748299598694, + "learning_rate": 1e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7690747529268265, + "num_tokens": 913921747.0, + "step": 3178 + }, + { + "epoch": 1.132146037399822, + "grad_norm": 0.557184636592865, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7704529017210007, + "num_tokens": 914193925.0, + "step": 3179 + }, + { + "epoch": 1.1325022261798754, + "grad_norm": 0.517868161201477, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7638819515705109, + "num_tokens": 914539813.0, + "step": 3180 + }, + { + "epoch": 1.1328584149599288, + "grad_norm": 0.49509093165397644, + "learning_rate": 1e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7567457109689713, + "num_tokens": 914836534.0, + "step": 3181 + }, + { + "epoch": 1.1332146037399822, + "grad_norm": 0.44149842858314514, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7607140839099884, + "num_tokens": 915117725.0, + "step": 3182 + }, + { + "epoch": 1.1335707925200356, + "grad_norm": 0.4924381673336029, + "learning_rate": 1e-06, + "loss": 0.7141, + 
"mean_token_accuracy": 0.7728259116411209, + "num_tokens": 915417142.0, + "step": 3183 + }, + { + "epoch": 1.133926981300089, + "grad_norm": 0.4960727393627167, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7624844014644623, + "num_tokens": 915691259.0, + "step": 3184 + }, + { + "epoch": 1.1342831700801426, + "grad_norm": 0.49032407999038696, + "learning_rate": 1e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.7973824888467789, + "num_tokens": 915989099.0, + "step": 3185 + }, + { + "epoch": 1.134639358860196, + "grad_norm": 0.4876800775527954, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7608355581760406, + "num_tokens": 916286258.0, + "step": 3186 + }, + { + "epoch": 1.1349955476402493, + "grad_norm": 0.4891667068004608, + "learning_rate": 1e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7759765982627869, + "num_tokens": 916595431.0, + "step": 3187 + }, + { + "epoch": 1.1353517364203027, + "grad_norm": 0.47717583179473877, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7564340233802795, + "num_tokens": 916911838.0, + "step": 3188 + }, + { + "epoch": 1.1357079252003561, + "grad_norm": 0.5197486877441406, + "learning_rate": 1e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7704198658466339, + "num_tokens": 917213429.0, + "step": 3189 + }, + { + "epoch": 1.1360641139804097, + "grad_norm": 0.4449552297592163, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7703706175088882, + "num_tokens": 917521795.0, + "step": 3190 + }, + { + "epoch": 1.1364203027604631, + "grad_norm": 0.48936399817466736, + "learning_rate": 1e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7653624713420868, + "num_tokens": 917819356.0, + "step": 3191 + }, + { + "epoch": 1.1367764915405165, + "grad_norm": 0.6546713709831238, + "learning_rate": 1e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7671235054731369, + "num_tokens": 918138120.0, + "step": 3192 + }, + { + "epoch": 1.13713268032057, + 
"grad_norm": 0.4596002399921417, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7547439783811569, + "num_tokens": 918448661.0, + "step": 3193 + }, + { + "epoch": 1.1374888691006233, + "grad_norm": 0.4226129353046417, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7749307751655579, + "num_tokens": 918758687.0, + "step": 3194 + }, + { + "epoch": 1.1378450578806767, + "grad_norm": 0.5630263686180115, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7609509378671646, + "num_tokens": 919008176.0, + "step": 3195 + }, + { + "epoch": 1.13820124666073, + "grad_norm": 0.4911596477031708, + "learning_rate": 1e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7784104943275452, + "num_tokens": 919289188.0, + "step": 3196 + }, + { + "epoch": 1.1385574354407837, + "grad_norm": 0.4966540038585663, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7576402425765991, + "num_tokens": 919541909.0, + "step": 3197 + }, + { + "epoch": 1.138913624220837, + "grad_norm": 0.45895275473594666, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7643071413040161, + "num_tokens": 919821580.0, + "step": 3198 + }, + { + "epoch": 1.1392698130008905, + "grad_norm": 0.4816399812698364, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7612667828798294, + "num_tokens": 920123112.0, + "step": 3199 + }, + { + "epoch": 1.1396260017809439, + "grad_norm": 0.405872106552124, + "learning_rate": 1e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7716707289218903, + "num_tokens": 920414389.0, + "step": 3200 + }, + { + "epoch": 1.1399821905609973, + "grad_norm": 0.4584169089794159, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7517463564872742, + "num_tokens": 920707788.0, + "step": 3201 + }, + { + "epoch": 1.1403383793410509, + "grad_norm": 0.5103431940078735, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7325493842363358, + 
"num_tokens": 920975048.0, + "step": 3202 + }, + { + "epoch": 1.1406945681211043, + "grad_norm": 0.48653632402420044, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7791712880134583, + "num_tokens": 921300725.0, + "step": 3203 + }, + { + "epoch": 1.1410507569011576, + "grad_norm": 0.5181437730789185, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.755754142999649, + "num_tokens": 921571792.0, + "step": 3204 + }, + { + "epoch": 1.141406945681211, + "grad_norm": 0.45352378487586975, + "learning_rate": 1e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7721519619226456, + "num_tokens": 921847049.0, + "step": 3205 + }, + { + "epoch": 1.1417631344612644, + "grad_norm": 0.44279295206069946, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7725668549537659, + "num_tokens": 922145874.0, + "step": 3206 + }, + { + "epoch": 1.1421193232413178, + "grad_norm": 0.47509312629699707, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7627657055854797, + "num_tokens": 922411185.0, + "step": 3207 + }, + { + "epoch": 1.1424755120213712, + "grad_norm": 0.48560237884521484, + "learning_rate": 1e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7521076947450638, + "num_tokens": 922669496.0, + "step": 3208 + }, + { + "epoch": 1.1428317008014248, + "grad_norm": 0.47519487142562866, + "learning_rate": 1e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7731393277645111, + "num_tokens": 922951018.0, + "step": 3209 + }, + { + "epoch": 1.1431878895814782, + "grad_norm": 0.5112836956977844, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7620871067047119, + "num_tokens": 923248119.0, + "step": 3210 + }, + { + "epoch": 1.1435440783615316, + "grad_norm": 0.46260157227516174, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7712306678295135, + "num_tokens": 923504393.0, + "step": 3211 + }, + { + "epoch": 1.143900267141585, + "grad_norm": 0.4772984981536865, + 
"learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7711443603038788, + "num_tokens": 923803554.0, + "step": 3212 + }, + { + "epoch": 1.1442564559216384, + "grad_norm": 0.4586586654186249, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7613513171672821, + "num_tokens": 924122989.0, + "step": 3213 + }, + { + "epoch": 1.144612644701692, + "grad_norm": 0.47064337134361267, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.768243670463562, + "num_tokens": 924403490.0, + "step": 3214 + }, + { + "epoch": 1.1449688334817454, + "grad_norm": 0.534736692905426, + "learning_rate": 1e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7695194482803345, + "num_tokens": 924654475.0, + "step": 3215 + }, + { + "epoch": 1.1453250222617988, + "grad_norm": 0.47856608033180237, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7656987011432648, + "num_tokens": 924926260.0, + "step": 3216 + }, + { + "epoch": 1.1456812110418522, + "grad_norm": 0.47710418701171875, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7566419243812561, + "num_tokens": 925218009.0, + "step": 3217 + }, + { + "epoch": 1.1460373998219056, + "grad_norm": 0.47296831011772156, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7730209976434708, + "num_tokens": 925511403.0, + "step": 3218 + }, + { + "epoch": 1.146393588601959, + "grad_norm": 0.45834022760391235, + "learning_rate": 1e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7544309198856354, + "num_tokens": 925855589.0, + "step": 3219 + }, + { + "epoch": 1.1467497773820126, + "grad_norm": 0.45587077736854553, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7733077108860016, + "num_tokens": 926169958.0, + "step": 3220 + }, + { + "epoch": 1.147105966162066, + "grad_norm": 0.46692970395088196, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7622552216053009, + "num_tokens": 926464439.0, + "step": 
3221 + }, + { + "epoch": 1.1474621549421193, + "grad_norm": 0.48914483189582825, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7570356875658035, + "num_tokens": 926728193.0, + "step": 3222 + }, + { + "epoch": 1.1478183437221727, + "grad_norm": 0.49406132102012634, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7645304501056671, + "num_tokens": 927019191.0, + "step": 3223 + }, + { + "epoch": 1.1481745325022261, + "grad_norm": 0.48769912123680115, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7547774910926819, + "num_tokens": 927289064.0, + "step": 3224 + }, + { + "epoch": 1.1485307212822797, + "grad_norm": 0.4711148738861084, + "learning_rate": 1e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7738222926855087, + "num_tokens": 927568454.0, + "step": 3225 + }, + { + "epoch": 1.1488869100623331, + "grad_norm": 0.46816524863243103, + "learning_rate": 1e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7733250707387924, + "num_tokens": 927836167.0, + "step": 3226 + }, + { + "epoch": 1.1492430988423865, + "grad_norm": 0.4541383683681488, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7702442854642868, + "num_tokens": 928120743.0, + "step": 3227 + }, + { + "epoch": 1.14959928762244, + "grad_norm": 0.521323025226593, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7620960772037506, + "num_tokens": 928395864.0, + "step": 3228 + }, + { + "epoch": 1.1499554764024933, + "grad_norm": 0.5048946738243103, + "learning_rate": 1e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7498297989368439, + "num_tokens": 928644759.0, + "step": 3229 + }, + { + "epoch": 1.1503116651825467, + "grad_norm": 0.4685378968715668, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7326494604349136, + "num_tokens": 928938153.0, + "step": 3230 + }, + { + "epoch": 1.1506678539626, + "grad_norm": 0.5123341679573059, + "learning_rate": 1e-06, + "loss": 0.7268, + 
"mean_token_accuracy": 0.7655965983867645, + "num_tokens": 929218113.0, + "step": 3231 + }, + { + "epoch": 1.1510240427426537, + "grad_norm": 0.5103779435157776, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7709027975797653, + "num_tokens": 929482710.0, + "step": 3232 + }, + { + "epoch": 1.151380231522707, + "grad_norm": 0.5378074049949646, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.761389747262001, + "num_tokens": 929735103.0, + "step": 3233 + }, + { + "epoch": 1.1517364203027605, + "grad_norm": 0.47866442799568176, + "learning_rate": 1e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7785331457853317, + "num_tokens": 930004117.0, + "step": 3234 + }, + { + "epoch": 1.1520926090828139, + "grad_norm": 0.4778697192668915, + "learning_rate": 1e-06, + "loss": 0.6756, + "mean_token_accuracy": 0.7804914861917496, + "num_tokens": 930268080.0, + "step": 3235 + }, + { + "epoch": 1.1524487978628672, + "grad_norm": 0.5285639762878418, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7654454410076141, + "num_tokens": 930538828.0, + "step": 3236 + }, + { + "epoch": 1.1528049866429209, + "grad_norm": 0.5067878365516663, + "learning_rate": 1e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7594195604324341, + "num_tokens": 930833208.0, + "step": 3237 + }, + { + "epoch": 1.1531611754229742, + "grad_norm": 0.4629148840904236, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7538661509752274, + "num_tokens": 931130328.0, + "step": 3238 + }, + { + "epoch": 1.1535173642030276, + "grad_norm": 0.49114078283309937, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7642973065376282, + "num_tokens": 931407037.0, + "step": 3239 + }, + { + "epoch": 1.153873552983081, + "grad_norm": 0.5034752488136292, + "learning_rate": 1e-06, + "loss": 0.622, + "mean_token_accuracy": 0.7965415269136429, + "num_tokens": 931694583.0, + "step": 3240 + }, + { + "epoch": 1.1542297417631344, + 
"grad_norm": 0.518129289150238, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7591413855552673, + "num_tokens": 931987764.0, + "step": 3241 + }, + { + "epoch": 1.1545859305431878, + "grad_norm": 0.5020893812179565, + "learning_rate": 1e-06, + "loss": 0.707, + "mean_token_accuracy": 0.767491489648819, + "num_tokens": 932271341.0, + "step": 3242 + }, + { + "epoch": 1.1549421193232412, + "grad_norm": 0.4788290560245514, + "learning_rate": 1e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7775324285030365, + "num_tokens": 932572614.0, + "step": 3243 + }, + { + "epoch": 1.1552983081032948, + "grad_norm": 0.4565419852733612, + "learning_rate": 1e-06, + "loss": 0.678, + "mean_token_accuracy": 0.7806428223848343, + "num_tokens": 932876523.0, + "step": 3244 + }, + { + "epoch": 1.1556544968833482, + "grad_norm": 0.49844691157341003, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7626233845949173, + "num_tokens": 933134679.0, + "step": 3245 + }, + { + "epoch": 1.1560106856634016, + "grad_norm": 0.5103393197059631, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.760471448302269, + "num_tokens": 933405627.0, + "step": 3246 + }, + { + "epoch": 1.156366874443455, + "grad_norm": 0.47261637449264526, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7621477693319321, + "num_tokens": 933681774.0, + "step": 3247 + }, + { + "epoch": 1.1567230632235084, + "grad_norm": 0.45165157318115234, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7437300235033035, + "num_tokens": 933964318.0, + "step": 3248 + }, + { + "epoch": 1.157079252003562, + "grad_norm": 0.509043276309967, + "learning_rate": 1e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7632238566875458, + "num_tokens": 934227899.0, + "step": 3249 + }, + { + "epoch": 1.1574354407836154, + "grad_norm": 0.5315507650375366, + "learning_rate": 1e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7658123970031738, + 
"num_tokens": 934489150.0, + "step": 3250 + }, + { + "epoch": 1.1577916295636688, + "grad_norm": 0.5020881295204163, + "learning_rate": 1e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7526706904172897, + "num_tokens": 934735602.0, + "step": 3251 + }, + { + "epoch": 1.1581478183437222, + "grad_norm": 0.48805445432662964, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.767566055059433, + "num_tokens": 935004390.0, + "step": 3252 + }, + { + "epoch": 1.1585040071237755, + "grad_norm": 0.5167570114135742, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7683283686637878, + "num_tokens": 935261100.0, + "step": 3253 + }, + { + "epoch": 1.158860195903829, + "grad_norm": 0.4509066045284271, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7672731280326843, + "num_tokens": 935577700.0, + "step": 3254 + }, + { + "epoch": 1.1592163846838826, + "grad_norm": 0.5065903663635254, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7541339993476868, + "num_tokens": 935838880.0, + "step": 3255 + }, + { + "epoch": 1.159572573463936, + "grad_norm": 0.5381780862808228, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7649996876716614, + "num_tokens": 936116642.0, + "step": 3256 + }, + { + "epoch": 1.1599287622439893, + "grad_norm": 0.44945642352104187, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7670581042766571, + "num_tokens": 936410050.0, + "step": 3257 + }, + { + "epoch": 1.1602849510240427, + "grad_norm": 0.47053125500679016, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7703406512737274, + "num_tokens": 936681899.0, + "step": 3258 + }, + { + "epoch": 1.1606411398040961, + "grad_norm": 0.4216308295726776, + "learning_rate": 1e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7753884643316269, + "num_tokens": 937000459.0, + "step": 3259 + }, + { + "epoch": 1.1609973285841497, + "grad_norm": 0.5021630525588989, + 
"learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7517319470643997, + "num_tokens": 937290450.0, + "step": 3260 + }, + { + "epoch": 1.1613535173642031, + "grad_norm": 0.4766922891139984, + "learning_rate": 1e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.7837875485420227, + "num_tokens": 937570156.0, + "step": 3261 + }, + { + "epoch": 1.1617097061442565, + "grad_norm": 0.4799405038356781, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7543182075023651, + "num_tokens": 937844133.0, + "step": 3262 + }, + { + "epoch": 1.16206589492431, + "grad_norm": 0.4611118733882904, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.757355585694313, + "num_tokens": 938151439.0, + "step": 3263 + }, + { + "epoch": 1.1624220837043633, + "grad_norm": 0.46489155292510986, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7465025335550308, + "num_tokens": 938454385.0, + "step": 3264 + }, + { + "epoch": 1.1627782724844167, + "grad_norm": 0.5311597585678101, + "learning_rate": 1e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7683654129505157, + "num_tokens": 938754847.0, + "step": 3265 + }, + { + "epoch": 1.16313446126447, + "grad_norm": 0.5027720332145691, + "learning_rate": 1e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7497476935386658, + "num_tokens": 939005270.0, + "step": 3266 + }, + { + "epoch": 1.1634906500445237, + "grad_norm": 0.47079670429229736, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7536444664001465, + "num_tokens": 939271348.0, + "step": 3267 + }, + { + "epoch": 1.163846838824577, + "grad_norm": 0.4460940957069397, + "learning_rate": 1e-06, + "loss": 0.6705, + "mean_token_accuracy": 0.7794346660375595, + "num_tokens": 939590490.0, + "step": 3268 + }, + { + "epoch": 1.1642030276046305, + "grad_norm": 0.4315313398838043, + "learning_rate": 1e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7670494318008423, + "num_tokens": 939890000.0, + "step": 3269 
+ }, + { + "epoch": 1.1645592163846838, + "grad_norm": 0.4464688301086426, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7658250629901886, + "num_tokens": 940162818.0, + "step": 3270 + }, + { + "epoch": 1.1649154051647372, + "grad_norm": 0.4177749752998352, + "learning_rate": 1e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7791570872068405, + "num_tokens": 940500019.0, + "step": 3271 + }, + { + "epoch": 1.1652715939447909, + "grad_norm": 0.488023579120636, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.773314580321312, + "num_tokens": 940788273.0, + "step": 3272 + }, + { + "epoch": 1.1656277827248442, + "grad_norm": 0.45386946201324463, + "learning_rate": 1e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7812874168157578, + "num_tokens": 941100800.0, + "step": 3273 + }, + { + "epoch": 1.1659839715048976, + "grad_norm": 0.48951029777526855, + "learning_rate": 1e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7585620433092117, + "num_tokens": 941394984.0, + "step": 3274 + }, + { + "epoch": 1.166340160284951, + "grad_norm": 0.4826955795288086, + "learning_rate": 1e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7710268646478653, + "num_tokens": 941676130.0, + "step": 3275 + }, + { + "epoch": 1.1666963490650044, + "grad_norm": 0.4993866980075836, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7634797394275665, + "num_tokens": 941959086.0, + "step": 3276 + }, + { + "epoch": 1.1670525378450578, + "grad_norm": 0.4526526927947998, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7647817730903625, + "num_tokens": 942257481.0, + "step": 3277 + }, + { + "epoch": 1.1674087266251112, + "grad_norm": 0.4723510146141052, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7646093517541885, + "num_tokens": 942562684.0, + "step": 3278 + }, + { + "epoch": 1.1677649154051648, + "grad_norm": 0.43131110072135925, + "learning_rate": 1e-06, + "loss": 0.7383, + 
"mean_token_accuracy": 0.7683639824390411, + "num_tokens": 942830238.0, + "step": 3279 + }, + { + "epoch": 1.1681211041852182, + "grad_norm": 0.4710744619369507, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7560999542474747, + "num_tokens": 943127917.0, + "step": 3280 + }, + { + "epoch": 1.1684772929652716, + "grad_norm": 0.444011926651001, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7581478804349899, + "num_tokens": 943434195.0, + "step": 3281 + }, + { + "epoch": 1.168833481745325, + "grad_norm": 0.46330562233924866, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7583888918161392, + "num_tokens": 943717893.0, + "step": 3282 + }, + { + "epoch": 1.1691896705253784, + "grad_norm": 0.4463711678981781, + "learning_rate": 1e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.765858843922615, + "num_tokens": 944043647.0, + "step": 3283 + }, + { + "epoch": 1.169545859305432, + "grad_norm": 0.4633803069591522, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7693940252065659, + "num_tokens": 944309905.0, + "step": 3284 + }, + { + "epoch": 1.1699020480854854, + "grad_norm": 0.4863562285900116, + "learning_rate": 1e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7752313613891602, + "num_tokens": 944602178.0, + "step": 3285 + }, + { + "epoch": 1.1702582368655388, + "grad_norm": 0.49105018377304077, + "learning_rate": 1e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.768187865614891, + "num_tokens": 944902861.0, + "step": 3286 + }, + { + "epoch": 1.1706144256455921, + "grad_norm": 0.4873173236846924, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7765636891126633, + "num_tokens": 945173921.0, + "step": 3287 + }, + { + "epoch": 1.1709706144256455, + "grad_norm": 0.5032939314842224, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.769246906042099, + "num_tokens": 945456772.0, + "step": 3288 + }, + { + "epoch": 1.171326803205699, + 
"grad_norm": 0.4836810827255249, + "learning_rate": 1e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7781181037425995, + "num_tokens": 945720785.0, + "step": 3289 + }, + { + "epoch": 1.1716829919857525, + "grad_norm": 0.4593159556388855, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7741860002279282, + "num_tokens": 946024632.0, + "step": 3290 + }, + { + "epoch": 1.172039180765806, + "grad_norm": 0.46458348631858826, + "learning_rate": 1e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7803985029459, + "num_tokens": 946301071.0, + "step": 3291 + }, + { + "epoch": 1.1723953695458593, + "grad_norm": 0.45564308762550354, + "learning_rate": 1e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7669415175914764, + "num_tokens": 946589582.0, + "step": 3292 + }, + { + "epoch": 1.1727515583259127, + "grad_norm": 0.5215089917182922, + "learning_rate": 1e-06, + "loss": 0.7368, + "mean_token_accuracy": 0.7633843570947647, + "num_tokens": 946865696.0, + "step": 3293 + }, + { + "epoch": 1.173107747105966, + "grad_norm": 0.43575161695480347, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7482839375734329, + "num_tokens": 947167845.0, + "step": 3294 + }, + { + "epoch": 1.1734639358860195, + "grad_norm": 0.512240469455719, + "learning_rate": 1e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7591547220945358, + "num_tokens": 947478681.0, + "step": 3295 + }, + { + "epoch": 1.173820124666073, + "grad_norm": 0.4053812325000763, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7543838173151016, + "num_tokens": 947809528.0, + "step": 3296 + }, + { + "epoch": 1.1741763134461265, + "grad_norm": 0.47906550765037537, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7726138085126877, + "num_tokens": 948071376.0, + "step": 3297 + }, + { + "epoch": 1.1745325022261799, + "grad_norm": 0.4667246639728546, + "learning_rate": 1e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.7571209371089935, + 
"num_tokens": 948356348.0, + "step": 3298 + }, + { + "epoch": 1.1748886910062333, + "grad_norm": 0.5178924798965454, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7562551200389862, + "num_tokens": 948645155.0, + "step": 3299 + }, + { + "epoch": 1.1752448797862867, + "grad_norm": 0.5159236788749695, + "learning_rate": 1e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7704707384109497, + "num_tokens": 948902941.0, + "step": 3300 + }, + { + "epoch": 1.17560106856634, + "grad_norm": 0.4812507927417755, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7527394890785217, + "num_tokens": 949190756.0, + "step": 3301 + }, + { + "epoch": 1.1759572573463937, + "grad_norm": 0.4487871527671814, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7552103251218796, + "num_tokens": 949481608.0, + "step": 3302 + }, + { + "epoch": 1.176313446126447, + "grad_norm": 0.477085679769516, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7673138380050659, + "num_tokens": 949780661.0, + "step": 3303 + }, + { + "epoch": 1.1766696349065005, + "grad_norm": 0.4394684433937073, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7571516931056976, + "num_tokens": 950076912.0, + "step": 3304 + }, + { + "epoch": 1.1770258236865538, + "grad_norm": 0.4978882968425751, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7664691060781479, + "num_tokens": 950361589.0, + "step": 3305 + }, + { + "epoch": 1.1773820124666072, + "grad_norm": 0.4836152493953705, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7612566500902176, + "num_tokens": 950651419.0, + "step": 3306 + }, + { + "epoch": 1.1777382012466608, + "grad_norm": 0.49649885296821594, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7596725672483444, + "num_tokens": 950908753.0, + "step": 3307 + }, + { + "epoch": 1.1780943900267142, + "grad_norm": 0.4454812705516815, + 
"learning_rate": 1e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7541951239109039, + "num_tokens": 951238060.0, + "step": 3308 + }, + { + "epoch": 1.1784505788067676, + "grad_norm": 0.5113121271133423, + "learning_rate": 1e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.7479295432567596, + "num_tokens": 951522106.0, + "step": 3309 + }, + { + "epoch": 1.178806767586821, + "grad_norm": 0.46560853719711304, + "learning_rate": 1e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.7630558460950851, + "num_tokens": 951821032.0, + "step": 3310 + }, + { + "epoch": 1.1791629563668744, + "grad_norm": 0.5115997791290283, + "learning_rate": 1e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7507321834564209, + "num_tokens": 952094829.0, + "step": 3311 + }, + { + "epoch": 1.1795191451469278, + "grad_norm": 0.46200239658355713, + "learning_rate": 1e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7747584283351898, + "num_tokens": 952384351.0, + "step": 3312 + }, + { + "epoch": 1.1798753339269812, + "grad_norm": 0.48106372356414795, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7738424986600876, + "num_tokens": 952668461.0, + "step": 3313 + }, + { + "epoch": 1.1802315227070348, + "grad_norm": 0.5192652940750122, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7612358629703522, + "num_tokens": 952911893.0, + "step": 3314 + }, + { + "epoch": 1.1805877114870882, + "grad_norm": 0.46400073170661926, + "learning_rate": 1e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7721622139215469, + "num_tokens": 953216582.0, + "step": 3315 + }, + { + "epoch": 1.1809439002671416, + "grad_norm": 0.46313443779945374, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7425864636898041, + "num_tokens": 953513272.0, + "step": 3316 + }, + { + "epoch": 1.181300089047195, + "grad_norm": 0.47729960083961487, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7539816945791245, + "num_tokens": 953804054.0, + "step": 
3317 + }, + { + "epoch": 1.1816562778272484, + "grad_norm": 0.4931299388408661, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7611341029405594, + "num_tokens": 954057488.0, + "step": 3318 + }, + { + "epoch": 1.182012466607302, + "grad_norm": 0.500717043876648, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7731538861989975, + "num_tokens": 954327404.0, + "step": 3319 + }, + { + "epoch": 1.1823686553873554, + "grad_norm": 0.42606714367866516, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7520419210195541, + "num_tokens": 954609395.0, + "step": 3320 + }, + { + "epoch": 1.1827248441674088, + "grad_norm": 0.4604932963848114, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7613820135593414, + "num_tokens": 954911399.0, + "step": 3321 + }, + { + "epoch": 1.1830810329474621, + "grad_norm": 0.4363405406475067, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7635630667209625, + "num_tokens": 955199342.0, + "step": 3322 + }, + { + "epoch": 1.1834372217275155, + "grad_norm": 0.4889676868915558, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.760472759604454, + "num_tokens": 955477346.0, + "step": 3323 + }, + { + "epoch": 1.183793410507569, + "grad_norm": 0.40986108779907227, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7730033844709396, + "num_tokens": 955833823.0, + "step": 3324 + }, + { + "epoch": 1.1841495992876225, + "grad_norm": 0.5623190402984619, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7404525727033615, + "num_tokens": 956070429.0, + "step": 3325 + }, + { + "epoch": 1.184505788067676, + "grad_norm": 0.5129731893539429, + "learning_rate": 1e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.744392991065979, + "num_tokens": 956350578.0, + "step": 3326 + }, + { + "epoch": 1.1848619768477293, + "grad_norm": 0.4606007933616638, + "learning_rate": 1e-06, + "loss": 0.7742, + 
"mean_token_accuracy": 0.7569826394319534, + "num_tokens": 956649406.0, + "step": 3327 + }, + { + "epoch": 1.1852181656277827, + "grad_norm": 0.48901137709617615, + "learning_rate": 1e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7767744809389114, + "num_tokens": 956954329.0, + "step": 3328 + }, + { + "epoch": 1.185574354407836, + "grad_norm": 0.5029784440994263, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7427161782979965, + "num_tokens": 957255419.0, + "step": 3329 + }, + { + "epoch": 1.1859305431878895, + "grad_norm": 0.4208320379257202, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7641977965831757, + "num_tokens": 957578682.0, + "step": 3330 + }, + { + "epoch": 1.186286731967943, + "grad_norm": 0.5596840977668762, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7427343130111694, + "num_tokens": 957816341.0, + "step": 3331 + }, + { + "epoch": 1.1866429207479965, + "grad_norm": 0.4101578891277313, + "learning_rate": 1e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.7831944823265076, + "num_tokens": 958126286.0, + "step": 3332 + }, + { + "epoch": 1.1869991095280499, + "grad_norm": 0.42248019576072693, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7692504376173019, + "num_tokens": 958451146.0, + "step": 3333 + }, + { + "epoch": 1.1873552983081033, + "grad_norm": 0.47394609451293945, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.762283205986023, + "num_tokens": 958738078.0, + "step": 3334 + }, + { + "epoch": 1.1877114870881567, + "grad_norm": 0.4319148361682892, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7683437913656235, + "num_tokens": 959052837.0, + "step": 3335 + }, + { + "epoch": 1.18806767586821, + "grad_norm": 0.4218701124191284, + "learning_rate": 1e-06, + "loss": 0.6636, + "mean_token_accuracy": 0.7842628955841064, + "num_tokens": 959400339.0, + "step": 3336 + }, + { + "epoch": 1.1884238646482637, + 
"grad_norm": 0.45826202630996704, + "learning_rate": 1e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7718265354633331, + "num_tokens": 959707406.0, + "step": 3337 + }, + { + "epoch": 1.188780053428317, + "grad_norm": 0.4925366938114166, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.747951403260231, + "num_tokens": 960007192.0, + "step": 3338 + }, + { + "epoch": 1.1891362422083704, + "grad_norm": 0.43687835335731506, + "learning_rate": 1e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.7931365519762039, + "num_tokens": 960304042.0, + "step": 3339 + }, + { + "epoch": 1.1894924309884238, + "grad_norm": 0.46310320496559143, + "learning_rate": 1e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.7807324379682541, + "num_tokens": 960597929.0, + "step": 3340 + }, + { + "epoch": 1.1898486197684772, + "grad_norm": 0.49192291498184204, + "learning_rate": 1e-06, + "loss": 0.693, + "mean_token_accuracy": 0.777772918343544, + "num_tokens": 960897084.0, + "step": 3341 + }, + { + "epoch": 1.1902048085485308, + "grad_norm": 0.5372546911239624, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7680357992649078, + "num_tokens": 961164086.0, + "step": 3342 + }, + { + "epoch": 1.1905609973285842, + "grad_norm": 0.4839804768562317, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7667095363140106, + "num_tokens": 961449112.0, + "step": 3343 + }, + { + "epoch": 1.1909171861086376, + "grad_norm": 0.48591843247413635, + "learning_rate": 1e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7536584287881851, + "num_tokens": 961727701.0, + "step": 3344 + }, + { + "epoch": 1.191273374888691, + "grad_norm": 0.5060969591140747, + "learning_rate": 1e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7760605067014694, + "num_tokens": 961988595.0, + "step": 3345 + }, + { + "epoch": 1.1916295636687444, + "grad_norm": 0.4580482542514801, + "learning_rate": 1e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7775477468967438, + 
"num_tokens": 962306341.0, + "step": 3346 + }, + { + "epoch": 1.1919857524487978, + "grad_norm": 0.5033878087997437, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.751137375831604, + "num_tokens": 962586449.0, + "step": 3347 + }, + { + "epoch": 1.1923419412288512, + "grad_norm": 0.47196996212005615, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7693209648132324, + "num_tokens": 962891281.0, + "step": 3348 + }, + { + "epoch": 1.1926981300089048, + "grad_norm": 0.45052531361579895, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7811135798692703, + "num_tokens": 963177330.0, + "step": 3349 + }, + { + "epoch": 1.1930543187889582, + "grad_norm": 0.45881593227386475, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.755794107913971, + "num_tokens": 963446343.0, + "step": 3350 + }, + { + "epoch": 1.1934105075690116, + "grad_norm": 0.5001793503761292, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7657973319292068, + "num_tokens": 963739100.0, + "step": 3351 + }, + { + "epoch": 1.193766696349065, + "grad_norm": 0.5228503346443176, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7610057145357132, + "num_tokens": 963968845.0, + "step": 3352 + }, + { + "epoch": 1.1941228851291183, + "grad_norm": 0.4812319874763489, + "learning_rate": 1e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7701807022094727, + "num_tokens": 964241960.0, + "step": 3353 + }, + { + "epoch": 1.194479073909172, + "grad_norm": 0.48769256472587585, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7731954008340836, + "num_tokens": 964491574.0, + "step": 3354 + }, + { + "epoch": 1.1948352626892254, + "grad_norm": 0.45226725935935974, + "learning_rate": 1e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.765157088637352, + "num_tokens": 964751561.0, + "step": 3355 + }, + { + "epoch": 1.1951914514692787, + "grad_norm": 0.4953950047492981, + 
"learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7657220214605331, + "num_tokens": 965039684.0, + "step": 3356 + }, + { + "epoch": 1.1955476402493321, + "grad_norm": 0.4942854344844818, + "learning_rate": 1e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7736958712339401, + "num_tokens": 965347271.0, + "step": 3357 + }, + { + "epoch": 1.1959038290293855, + "grad_norm": 0.4424179494380951, + "learning_rate": 1e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.7862219214439392, + "num_tokens": 965662349.0, + "step": 3358 + }, + { + "epoch": 1.196260017809439, + "grad_norm": 0.42601636052131653, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7661962509155273, + "num_tokens": 965973984.0, + "step": 3359 + }, + { + "epoch": 1.1966162065894925, + "grad_norm": 0.5198255777359009, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7596662491559982, + "num_tokens": 966245271.0, + "step": 3360 + }, + { + "epoch": 1.196972395369546, + "grad_norm": 0.4894694983959198, + "learning_rate": 1e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.778289332985878, + "num_tokens": 966541417.0, + "step": 3361 + }, + { + "epoch": 1.1973285841495993, + "grad_norm": 0.4878772497177124, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7626387625932693, + "num_tokens": 966843026.0, + "step": 3362 + }, + { + "epoch": 1.1976847729296527, + "grad_norm": 0.4626888930797577, + "learning_rate": 1e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7846333086490631, + "num_tokens": 967149586.0, + "step": 3363 + }, + { + "epoch": 1.198040961709706, + "grad_norm": 0.4547319710254669, + "learning_rate": 1e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7498854994773865, + "num_tokens": 967446317.0, + "step": 3364 + }, + { + "epoch": 1.1983971504897595, + "grad_norm": 0.4918306767940521, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7605645656585693, + "num_tokens": 967740691.0, + "step": 3365 
+ }, + { + "epoch": 1.198753339269813, + "grad_norm": 0.479667067527771, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7451979368925095, + "num_tokens": 968040899.0, + "step": 3366 + }, + { + "epoch": 1.1991095280498665, + "grad_norm": 0.5213940143585205, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7427915781736374, + "num_tokens": 968290141.0, + "step": 3367 + }, + { + "epoch": 1.1994657168299199, + "grad_norm": 0.577700138092041, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7621737420558929, + "num_tokens": 968549780.0, + "step": 3368 + }, + { + "epoch": 1.1998219056099733, + "grad_norm": 0.5412237644195557, + "learning_rate": 1e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.7583513110876083, + "num_tokens": 968821894.0, + "step": 3369 + }, + { + "epoch": 1.2001780943900267, + "grad_norm": 0.46260616183280945, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.772611528635025, + "num_tokens": 969093854.0, + "step": 3370 + }, + { + "epoch": 1.20053428317008, + "grad_norm": 0.5116922855377197, + "learning_rate": 1e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7786845862865448, + "num_tokens": 969391332.0, + "step": 3371 + }, + { + "epoch": 1.2008904719501337, + "grad_norm": 0.5213092565536499, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7461398392915726, + "num_tokens": 969684209.0, + "step": 3372 + }, + { + "epoch": 1.201246660730187, + "grad_norm": 0.44430750608444214, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7644157856702805, + "num_tokens": 969977299.0, + "step": 3373 + }, + { + "epoch": 1.2016028495102404, + "grad_norm": 0.48430725932121277, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7636374831199646, + "num_tokens": 970276004.0, + "step": 3374 + }, + { + "epoch": 1.2019590382902938, + "grad_norm": 0.554911732673645, + "learning_rate": 1e-06, + "loss": 0.6893, + 
"mean_token_accuracy": 0.7732786536216736, + "num_tokens": 970563450.0, + "step": 3375 + }, + { + "epoch": 1.2023152270703472, + "grad_norm": 0.4759424328804016, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7686376720666885, + "num_tokens": 970863577.0, + "step": 3376 + }, + { + "epoch": 1.2026714158504008, + "grad_norm": 0.4868471622467041, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7669153213500977, + "num_tokens": 971145021.0, + "step": 3377 + }, + { + "epoch": 1.2030276046304542, + "grad_norm": 0.44932428002357483, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7614058256149292, + "num_tokens": 971480918.0, + "step": 3378 + }, + { + "epoch": 1.2033837934105076, + "grad_norm": 0.4740165174007416, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7675842791795731, + "num_tokens": 971778882.0, + "step": 3379 + }, + { + "epoch": 1.203739982190561, + "grad_norm": 0.5153330564498901, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.758986085653305, + "num_tokens": 972066553.0, + "step": 3380 + }, + { + "epoch": 1.2040961709706144, + "grad_norm": 0.5739939212799072, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7735912799835205, + "num_tokens": 972326848.0, + "step": 3381 + }, + { + "epoch": 1.2044523597506678, + "grad_norm": 0.5054826140403748, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7658246606588364, + "num_tokens": 972620563.0, + "step": 3382 + }, + { + "epoch": 1.2048085485307212, + "grad_norm": 0.4436478614807129, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7581405341625214, + "num_tokens": 972921611.0, + "step": 3383 + }, + { + "epoch": 1.2051647373107748, + "grad_norm": 0.4570413827896118, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7438570559024811, + "num_tokens": 973172104.0, + "step": 3384 + }, + { + "epoch": 1.2055209260908282, + 
"grad_norm": 0.4618856608867645, + "learning_rate": 1e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.7593649625778198, + "num_tokens": 973453312.0, + "step": 3385 + }, + { + "epoch": 1.2058771148708816, + "grad_norm": 0.46314260363578796, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7624400407075882, + "num_tokens": 973757315.0, + "step": 3386 + }, + { + "epoch": 1.206233303650935, + "grad_norm": 0.42795079946517944, + "learning_rate": 1e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7426390051841736, + "num_tokens": 974078459.0, + "step": 3387 + }, + { + "epoch": 1.2065894924309883, + "grad_norm": 0.5211626291275024, + "learning_rate": 1e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7616118937730789, + "num_tokens": 974372693.0, + "step": 3388 + }, + { + "epoch": 1.206945681211042, + "grad_norm": 0.49125567078590393, + "learning_rate": 1e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7772888094186783, + "num_tokens": 974639518.0, + "step": 3389 + }, + { + "epoch": 1.2073018699910953, + "grad_norm": 0.48000872135162354, + "learning_rate": 1e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7601348161697388, + "num_tokens": 974938484.0, + "step": 3390 + }, + { + "epoch": 1.2076580587711487, + "grad_norm": 0.4746055603027344, + "learning_rate": 1e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7761966288089752, + "num_tokens": 975216310.0, + "step": 3391 + }, + { + "epoch": 1.2080142475512021, + "grad_norm": 0.5137128233909607, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7668049782514572, + "num_tokens": 975495243.0, + "step": 3392 + }, + { + "epoch": 1.2083704363312555, + "grad_norm": 0.512762188911438, + "learning_rate": 1e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7825025469064713, + "num_tokens": 975761616.0, + "step": 3393 + }, + { + "epoch": 1.208726625111309, + "grad_norm": 0.5070603489875793, + "learning_rate": 1e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7652571350336075, + 
"num_tokens": 976028944.0, + "step": 3394 + }, + { + "epoch": 1.2090828138913625, + "grad_norm": 0.4222661554813385, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7611879110336304, + "num_tokens": 976321047.0, + "step": 3395 + }, + { + "epoch": 1.209439002671416, + "grad_norm": 0.48665082454681396, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7667573541402817, + "num_tokens": 976617158.0, + "step": 3396 + }, + { + "epoch": 1.2097951914514693, + "grad_norm": 0.47272711992263794, + "learning_rate": 1e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7658065855503082, + "num_tokens": 976902219.0, + "step": 3397 + }, + { + "epoch": 1.2101513802315227, + "grad_norm": 0.482931524515152, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7529700845479965, + "num_tokens": 977181345.0, + "step": 3398 + }, + { + "epoch": 1.210507569011576, + "grad_norm": 0.5220736265182495, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.760710597038269, + "num_tokens": 977448098.0, + "step": 3399 + }, + { + "epoch": 1.2108637577916295, + "grad_norm": 0.5077282190322876, + "learning_rate": 1e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7562188655138016, + "num_tokens": 977724875.0, + "step": 3400 + }, + { + "epoch": 1.211219946571683, + "grad_norm": 0.4713895320892334, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7441191077232361, + "num_tokens": 978038815.0, + "step": 3401 + }, + { + "epoch": 1.2115761353517365, + "grad_norm": 0.49432480335235596, + "learning_rate": 1e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7825824171304703, + "num_tokens": 978333975.0, + "step": 3402 + }, + { + "epoch": 1.2119323241317899, + "grad_norm": 0.5474979281425476, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7543317377567291, + "num_tokens": 978590086.0, + "step": 3403 + }, + { + "epoch": 1.2122885129118433, + "grad_norm": 0.4954598546028137, + 
"learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7730524837970734, + "num_tokens": 978877985.0, + "step": 3404 + }, + { + "epoch": 1.2126447016918966, + "grad_norm": 0.47855785489082336, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7779289782047272, + "num_tokens": 979169092.0, + "step": 3405 + }, + { + "epoch": 1.21300089047195, + "grad_norm": 0.4857344925403595, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7602219879627228, + "num_tokens": 979459897.0, + "step": 3406 + }, + { + "epoch": 1.2133570792520036, + "grad_norm": 0.45595583319664, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7498735934495926, + "num_tokens": 979752062.0, + "step": 3407 + }, + { + "epoch": 1.213713268032057, + "grad_norm": 0.5131362676620483, + "learning_rate": 1e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7626455426216125, + "num_tokens": 980004009.0, + "step": 3408 + }, + { + "epoch": 1.2140694568121104, + "grad_norm": 0.4947492778301239, + "learning_rate": 1e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7778419703245163, + "num_tokens": 980255651.0, + "step": 3409 + }, + { + "epoch": 1.2144256455921638, + "grad_norm": 0.4815713167190552, + "learning_rate": 1e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7689475566148758, + "num_tokens": 980551606.0, + "step": 3410 + }, + { + "epoch": 1.2147818343722172, + "grad_norm": 0.43520164489746094, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7682501077651978, + "num_tokens": 980876407.0, + "step": 3411 + }, + { + "epoch": 1.2151380231522708, + "grad_norm": 0.4555973410606384, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7506759613752365, + "num_tokens": 981149924.0, + "step": 3412 + }, + { + "epoch": 1.2154942119323242, + "grad_norm": 0.4675728380680084, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7708857208490372, + "num_tokens": 981438730.0, + "step": 3413 
+ }, + { + "epoch": 1.2158504007123776, + "grad_norm": 0.4906216561794281, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7588112056255341, + "num_tokens": 981721633.0, + "step": 3414 + }, + { + "epoch": 1.216206589492431, + "grad_norm": 0.4205019176006317, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7708867937326431, + "num_tokens": 982019697.0, + "step": 3415 + }, + { + "epoch": 1.2165627782724844, + "grad_norm": 0.4879659116268158, + "learning_rate": 1e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7695546299219131, + "num_tokens": 982296256.0, + "step": 3416 + }, + { + "epoch": 1.2169189670525378, + "grad_norm": 0.5222040414810181, + "learning_rate": 1e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7715698331594467, + "num_tokens": 982572611.0, + "step": 3417 + }, + { + "epoch": 1.2172751558325912, + "grad_norm": 0.47648072242736816, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7608013451099396, + "num_tokens": 982854181.0, + "step": 3418 + }, + { + "epoch": 1.2176313446126448, + "grad_norm": 0.45921412110328674, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7501471638679504, + "num_tokens": 983157451.0, + "step": 3419 + }, + { + "epoch": 1.2179875333926982, + "grad_norm": 0.49584370851516724, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7450685948133469, + "num_tokens": 983444112.0, + "step": 3420 + }, + { + "epoch": 1.2183437221727516, + "grad_norm": 0.5329205393791199, + "learning_rate": 1e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7655028402805328, + "num_tokens": 983693479.0, + "step": 3421 + }, + { + "epoch": 1.218699910952805, + "grad_norm": 0.46884098649024963, + "learning_rate": 1e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7625435888767242, + "num_tokens": 983978159.0, + "step": 3422 + }, + { + "epoch": 1.2190560997328583, + "grad_norm": 0.4828549027442932, + "learning_rate": 1e-06, + "loss": 0.7813, + 
"mean_token_accuracy": 0.7500600665807724, + "num_tokens": 984235480.0, + "step": 3423 + }, + { + "epoch": 1.219412288512912, + "grad_norm": 0.44913196563720703, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7743013203144073, + "num_tokens": 984563846.0, + "step": 3424 + }, + { + "epoch": 1.2197684772929653, + "grad_norm": 0.45340490341186523, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7565374821424484, + "num_tokens": 984878821.0, + "step": 3425 + }, + { + "epoch": 1.2201246660730187, + "grad_norm": 0.45777469873428345, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.768537312746048, + "num_tokens": 985151264.0, + "step": 3426 + }, + { + "epoch": 1.2204808548530721, + "grad_norm": 0.5000869631767273, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7700274586677551, + "num_tokens": 985421533.0, + "step": 3427 + }, + { + "epoch": 1.2208370436331255, + "grad_norm": 0.4554937779903412, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7704908102750778, + "num_tokens": 985771733.0, + "step": 3428 + }, + { + "epoch": 1.221193232413179, + "grad_norm": 0.4970173239707947, + "learning_rate": 1e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.7788828313350677, + "num_tokens": 986049485.0, + "step": 3429 + }, + { + "epoch": 1.2215494211932325, + "grad_norm": 0.5314491391181946, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7630954682826996, + "num_tokens": 986280397.0, + "step": 3430 + }, + { + "epoch": 1.221905609973286, + "grad_norm": 0.4730530083179474, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7644700407981873, + "num_tokens": 986547534.0, + "step": 3431 + }, + { + "epoch": 1.2222617987533393, + "grad_norm": 0.49340760707855225, + "learning_rate": 1e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7657118439674377, + "num_tokens": 986839514.0, + "step": 3432 + }, + { + "epoch": 1.2226179875333927, + 
"grad_norm": 0.4657140076160431, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.759264811873436, + "num_tokens": 987118543.0, + "step": 3433 + }, + { + "epoch": 1.222974176313446, + "grad_norm": 0.5023773908615112, + "learning_rate": 1e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7697533667087555, + "num_tokens": 987384793.0, + "step": 3434 + }, + { + "epoch": 1.2233303650934995, + "grad_norm": 0.4912322163581848, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7503153681755066, + "num_tokens": 987668313.0, + "step": 3435 + }, + { + "epoch": 1.223686553873553, + "grad_norm": 0.4320482313632965, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7644876837730408, + "num_tokens": 987998760.0, + "step": 3436 + }, + { + "epoch": 1.2240427426536065, + "grad_norm": 0.4710962772369385, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.752508282661438, + "num_tokens": 988296191.0, + "step": 3437 + }, + { + "epoch": 1.2243989314336599, + "grad_norm": 0.44079461693763733, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7617183178663254, + "num_tokens": 988586581.0, + "step": 3438 + }, + { + "epoch": 1.2247551202137132, + "grad_norm": 0.475272536277771, + "learning_rate": 1e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7568731009960175, + "num_tokens": 988885795.0, + "step": 3439 + }, + { + "epoch": 1.2251113089937666, + "grad_norm": 0.45945677161216736, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7764138132333755, + "num_tokens": 989195276.0, + "step": 3440 + }, + { + "epoch": 1.22546749777382, + "grad_norm": 0.4531209468841553, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7639764249324799, + "num_tokens": 989495484.0, + "step": 3441 + }, + { + "epoch": 1.2258236865538736, + "grad_norm": 0.5059521198272705, + "learning_rate": 1e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.7830522358417511, + 
"num_tokens": 989760095.0, + "step": 3442 + }, + { + "epoch": 1.226179875333927, + "grad_norm": 0.4849840998649597, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7522769719362259, + "num_tokens": 990033799.0, + "step": 3443 + }, + { + "epoch": 1.2265360641139804, + "grad_norm": 0.43225687742233276, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7740731835365295, + "num_tokens": 990359565.0, + "step": 3444 + }, + { + "epoch": 1.2268922528940338, + "grad_norm": 0.5151911377906799, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.754563719034195, + "num_tokens": 990647746.0, + "step": 3445 + }, + { + "epoch": 1.2272484416740872, + "grad_norm": 0.4686726927757263, + "learning_rate": 1e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7631732225418091, + "num_tokens": 990926832.0, + "step": 3446 + }, + { + "epoch": 1.2276046304541408, + "grad_norm": 0.47237205505371094, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7741805762052536, + "num_tokens": 991190761.0, + "step": 3447 + }, + { + "epoch": 1.2279608192341942, + "grad_norm": 0.5178698301315308, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7625367343425751, + "num_tokens": 991475972.0, + "step": 3448 + }, + { + "epoch": 1.2283170080142476, + "grad_norm": 0.5009039044380188, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7508953511714935, + "num_tokens": 991744715.0, + "step": 3449 + }, + { + "epoch": 1.228673196794301, + "grad_norm": 0.47544825077056885, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.764723151922226, + "num_tokens": 991998858.0, + "step": 3450 + }, + { + "epoch": 1.2290293855743544, + "grad_norm": 0.46613219380378723, + "learning_rate": 1e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.7637366056442261, + "num_tokens": 992285886.0, + "step": 3451 + }, + { + "epoch": 1.2293855743544078, + "grad_norm": 0.45227065682411194, + 
"learning_rate": 1e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7698342800140381, + "num_tokens": 992574279.0, + "step": 3452 + }, + { + "epoch": 1.2297417631344612, + "grad_norm": 0.5036988258361816, + "learning_rate": 1e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7710603177547455, + "num_tokens": 992879154.0, + "step": 3453 + }, + { + "epoch": 1.2300979519145148, + "grad_norm": 0.4890623390674591, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7528882920742035, + "num_tokens": 993161544.0, + "step": 3454 + }, + { + "epoch": 1.2304541406945682, + "grad_norm": 0.46380817890167236, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7712195068597794, + "num_tokens": 993480354.0, + "step": 3455 + }, + { + "epoch": 1.2308103294746215, + "grad_norm": 0.4630189836025238, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.751652866601944, + "num_tokens": 993771461.0, + "step": 3456 + }, + { + "epoch": 1.231166518254675, + "grad_norm": 0.461286723613739, + "learning_rate": 1e-06, + "loss": 0.6582, + "mean_token_accuracy": 0.785789281129837, + "num_tokens": 994084684.0, + "step": 3457 + }, + { + "epoch": 1.2315227070347283, + "grad_norm": 0.5186125040054321, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7415924072265625, + "num_tokens": 994369594.0, + "step": 3458 + }, + { + "epoch": 1.231878895814782, + "grad_norm": 0.5188738703727722, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7531268745660782, + "num_tokens": 994600793.0, + "step": 3459 + }, + { + "epoch": 1.2322350845948353, + "grad_norm": 0.4313422739505768, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7689663618803024, + "num_tokens": 994924572.0, + "step": 3460 + }, + { + "epoch": 1.2325912733748887, + "grad_norm": 0.4880426526069641, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7712698727846146, + "num_tokens": 995184769.0, + "step": 3461 + 
}, + { + "epoch": 1.232947462154942, + "grad_norm": 0.43852367997169495, + "learning_rate": 1e-06, + "loss": 0.6587, + "mean_token_accuracy": 0.7881899029016495, + "num_tokens": 995483422.0, + "step": 3462 + }, + { + "epoch": 1.2333036509349955, + "grad_norm": 0.48466965556144714, + "learning_rate": 1e-06, + "loss": 0.6706, + "mean_token_accuracy": 0.785311758518219, + "num_tokens": 995802329.0, + "step": 3463 + }, + { + "epoch": 1.233659839715049, + "grad_norm": 0.4867824912071228, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7611294090747833, + "num_tokens": 996097252.0, + "step": 3464 + }, + { + "epoch": 1.2340160284951023, + "grad_norm": 0.4588238000869751, + "learning_rate": 1e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7798283398151398, + "num_tokens": 996386155.0, + "step": 3465 + }, + { + "epoch": 1.234372217275156, + "grad_norm": 0.48991453647613525, + "learning_rate": 1e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.792823925614357, + "num_tokens": 996697072.0, + "step": 3466 + }, + { + "epoch": 1.2347284060552093, + "grad_norm": 0.4554866850376129, + "learning_rate": 1e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7777584195137024, + "num_tokens": 996989611.0, + "step": 3467 + }, + { + "epoch": 1.2350845948352627, + "grad_norm": 0.5091556310653687, + "learning_rate": 1e-06, + "loss": 0.6463, + "mean_token_accuracy": 0.7875558584928513, + "num_tokens": 997275360.0, + "step": 3468 + }, + { + "epoch": 1.235440783615316, + "grad_norm": 0.4691392779350281, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7596148401498795, + "num_tokens": 997597150.0, + "step": 3469 + }, + { + "epoch": 1.2357969723953695, + "grad_norm": 0.5013836622238159, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7743657380342484, + "num_tokens": 997873482.0, + "step": 3470 + }, + { + "epoch": 1.236153161175423, + "grad_norm": 0.4938734173774719, + "learning_rate": 1e-06, + "loss": 0.7586, + 
"mean_token_accuracy": 0.7572360932826996, + "num_tokens": 998164067.0, + "step": 3471 + }, + { + "epoch": 1.2365093499554765, + "grad_norm": 0.4696088433265686, + "learning_rate": 1e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7794754803180695, + "num_tokens": 998469403.0, + "step": 3472 + }, + { + "epoch": 1.2368655387355298, + "grad_norm": 0.47132694721221924, + "learning_rate": 1e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7821063995361328, + "num_tokens": 998759929.0, + "step": 3473 + }, + { + "epoch": 1.2372217275155832, + "grad_norm": 0.44304975867271423, + "learning_rate": 1e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.764348641037941, + "num_tokens": 999038743.0, + "step": 3474 + }, + { + "epoch": 1.2375779162956366, + "grad_norm": 0.464814692735672, + "learning_rate": 1e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7698058187961578, + "num_tokens": 999365856.0, + "step": 3475 + }, + { + "epoch": 1.23793410507569, + "grad_norm": 0.49954766035079956, + "learning_rate": 1e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7810889482498169, + "num_tokens": 999643245.0, + "step": 3476 + }, + { + "epoch": 1.2382902938557436, + "grad_norm": 0.43511825799942017, + "learning_rate": 1e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7517727017402649, + "num_tokens": 999968255.0, + "step": 3477 + }, + { + "epoch": 1.238646482635797, + "grad_norm": 0.507865846157074, + "learning_rate": 1e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.7865295708179474, + "num_tokens": 1000242513.0, + "step": 3478 + }, + { + "epoch": 1.2390026714158504, + "grad_norm": 0.4959440231323242, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7640745490789413, + "num_tokens": 1000548479.0, + "step": 3479 + }, + { + "epoch": 1.2393588601959038, + "grad_norm": 0.47619152069091797, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7660731673240662, + "num_tokens": 1000848467.0, + "step": 3480 + }, + { + "epoch": 1.2397150489759572, 
+ "grad_norm": 0.48904451727867126, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7676899284124374, + "num_tokens": 1001120769.0, + "step": 3481 + }, + { + "epoch": 1.2400712377560108, + "grad_norm": 0.4775557816028595, + "learning_rate": 1e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7836542278528214, + "num_tokens": 1001395217.0, + "step": 3482 + }, + { + "epoch": 1.2404274265360642, + "grad_norm": 0.513003945350647, + "learning_rate": 1e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7621307224035263, + "num_tokens": 1001689374.0, + "step": 3483 + }, + { + "epoch": 1.2407836153161176, + "grad_norm": 0.5096443295478821, + "learning_rate": 1e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7595735341310501, + "num_tokens": 1001967364.0, + "step": 3484 + }, + { + "epoch": 1.241139804096171, + "grad_norm": 0.4309687316417694, + "learning_rate": 1e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.758495420217514, + "num_tokens": 1002295868.0, + "step": 3485 + }, + { + "epoch": 1.2414959928762244, + "grad_norm": 0.4744729697704315, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7693098038434982, + "num_tokens": 1002574129.0, + "step": 3486 + }, + { + "epoch": 1.2418521816562778, + "grad_norm": 0.46319785714149475, + "learning_rate": 1e-06, + "loss": 0.6727, + "mean_token_accuracy": 0.7819841951131821, + "num_tokens": 1002848538.0, + "step": 3487 + }, + { + "epoch": 1.2422083704363311, + "grad_norm": 0.5163546800613403, + "learning_rate": 1e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7698308527469635, + "num_tokens": 1003148446.0, + "step": 3488 + }, + { + "epoch": 1.2425645592163848, + "grad_norm": 0.48613008856773376, + "learning_rate": 1e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.7779054194688797, + "num_tokens": 1003418896.0, + "step": 3489 + }, + { + "epoch": 1.2429207479964381, + "grad_norm": 0.4867829382419586, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 
0.7548307180404663, + "num_tokens": 1003687109.0, + "step": 3490 + }, + { + "epoch": 1.2432769367764915, + "grad_norm": 0.45134449005126953, + "learning_rate": 1e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7651066482067108, + "num_tokens": 1004007708.0, + "step": 3491 + }, + { + "epoch": 1.243633125556545, + "grad_norm": 0.5049271583557129, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7661766111850739, + "num_tokens": 1004288105.0, + "step": 3492 + }, + { + "epoch": 1.2439893143365983, + "grad_norm": 0.49824127554893494, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7639829963445663, + "num_tokens": 1004529821.0, + "step": 3493 + }, + { + "epoch": 1.244345503116652, + "grad_norm": 0.4635251462459564, + "learning_rate": 1e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7639932334423065, + "num_tokens": 1004824826.0, + "step": 3494 + }, + { + "epoch": 1.2447016918967053, + "grad_norm": 0.4580394923686981, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7475364953279495, + "num_tokens": 1005114464.0, + "step": 3495 + }, + { + "epoch": 1.2450578806767587, + "grad_norm": 0.4807792901992798, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7749013155698776, + "num_tokens": 1005418455.0, + "step": 3496 + }, + { + "epoch": 1.245414069456812, + "grad_norm": 0.4231571853160858, + "learning_rate": 1e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.782385528087616, + "num_tokens": 1005721482.0, + "step": 3497 + }, + { + "epoch": 1.2457702582368655, + "grad_norm": 0.5063610076904297, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7531346082687378, + "num_tokens": 1005940582.0, + "step": 3498 + }, + { + "epoch": 1.2461264470169189, + "grad_norm": 0.5019506216049194, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7572242468595505, + "num_tokens": 1006228138.0, + "step": 3499 + }, + { + "epoch": 1.2464826357969723, + "grad_norm": 
0.4455752968788147, + "learning_rate": 1e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7738469392061234, + "num_tokens": 1006554687.0, + "step": 3500 + }, + { + "epoch": 1.2468388245770259, + "grad_norm": 0.45826777815818787, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7716533541679382, + "num_tokens": 1006863406.0, + "step": 3501 + }, + { + "epoch": 1.2471950133570793, + "grad_norm": 0.43991753458976746, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7661381959915161, + "num_tokens": 1007138937.0, + "step": 3502 + }, + { + "epoch": 1.2475512021371327, + "grad_norm": 0.48968973755836487, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7578066140413284, + "num_tokens": 1007458038.0, + "step": 3503 + }, + { + "epoch": 1.247907390917186, + "grad_norm": 0.47254103422164917, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7724457830190659, + "num_tokens": 1007756522.0, + "step": 3504 + }, + { + "epoch": 1.2482635796972394, + "grad_norm": 0.45396679639816284, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7494983226060867, + "num_tokens": 1008043531.0, + "step": 3505 + }, + { + "epoch": 1.248619768477293, + "grad_norm": 0.4717479348182678, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7580050528049469, + "num_tokens": 1008338571.0, + "step": 3506 + }, + { + "epoch": 1.2489759572573464, + "grad_norm": 0.4559433162212372, + "learning_rate": 1e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.771881490945816, + "num_tokens": 1008662210.0, + "step": 3507 + }, + { + "epoch": 1.2493321460373998, + "grad_norm": 0.45423147082328796, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7516300231218338, + "num_tokens": 1008951459.0, + "step": 3508 + }, + { + "epoch": 1.2496883348174532, + "grad_norm": 0.44147735834121704, + "learning_rate": 1e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7450189292430878, + 
"num_tokens": 1009287763.0, + "step": 3509 + }, + { + "epoch": 1.2500445235975066, + "grad_norm": 0.46902498602867126, + "learning_rate": 1e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7696200162172318, + "num_tokens": 1009574834.0, + "step": 3510 + }, + { + "epoch": 1.25040071237756, + "grad_norm": 0.5515185594558716, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7682311683893204, + "num_tokens": 1009815743.0, + "step": 3511 + }, + { + "epoch": 1.2507569011576134, + "grad_norm": 0.47423696517944336, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7568088620901108, + "num_tokens": 1010091209.0, + "step": 3512 + }, + { + "epoch": 1.251113089937667, + "grad_norm": 0.478875070810318, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7722838222980499, + "num_tokens": 1010404015.0, + "step": 3513 + }, + { + "epoch": 1.2514692787177204, + "grad_norm": 0.4752337634563446, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7608414143323898, + "num_tokens": 1010680511.0, + "step": 3514 + }, + { + "epoch": 1.2518254674977738, + "grad_norm": 0.48404428362846375, + "learning_rate": 1e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7563730776309967, + "num_tokens": 1010968244.0, + "step": 3515 + }, + { + "epoch": 1.2521816562778272, + "grad_norm": 0.5132765769958496, + "learning_rate": 1e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7787764519453049, + "num_tokens": 1011261116.0, + "step": 3516 + }, + { + "epoch": 1.2525378450578808, + "grad_norm": 0.49965715408325195, + "learning_rate": 1e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7734783589839935, + "num_tokens": 1011551849.0, + "step": 3517 + }, + { + "epoch": 1.2528940338379342, + "grad_norm": 0.4238196909427643, + "learning_rate": 1e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7455191314220428, + "num_tokens": 1011872314.0, + "step": 3518 + }, + { + "epoch": 1.2532502226179876, + "grad_norm": 0.4561519920825958, + 
"learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7534107863903046, + "num_tokens": 1012172570.0, + "step": 3519 + }, + { + "epoch": 1.253606411398041, + "grad_norm": 0.4728517234325409, + "learning_rate": 1e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7770540565252304, + "num_tokens": 1012468601.0, + "step": 3520 + }, + { + "epoch": 1.2539626001780944, + "grad_norm": 0.4684954285621643, + "learning_rate": 1e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7755542546510696, + "num_tokens": 1012765868.0, + "step": 3521 + }, + { + "epoch": 1.2543187889581477, + "grad_norm": 0.4650755822658539, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7652410566806793, + "num_tokens": 1013062969.0, + "step": 3522 + }, + { + "epoch": 1.2546749777382011, + "grad_norm": 0.47314345836639404, + "learning_rate": 1e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7771139442920685, + "num_tokens": 1013332829.0, + "step": 3523 + }, + { + "epoch": 1.2550311665182547, + "grad_norm": 0.4453634023666382, + "learning_rate": 1e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7828645259141922, + "num_tokens": 1013603059.0, + "step": 3524 + }, + { + "epoch": 1.2553873552983081, + "grad_norm": 0.5396776795387268, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7486167103052139, + "num_tokens": 1013853080.0, + "step": 3525 + }, + { + "epoch": 1.2557435440783615, + "grad_norm": 0.5564213991165161, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.763215646147728, + "num_tokens": 1014182743.0, + "step": 3526 + }, + { + "epoch": 1.256099732858415, + "grad_norm": 0.42705950140953064, + "learning_rate": 1e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7840590178966522, + "num_tokens": 1014492445.0, + "step": 3527 + }, + { + "epoch": 1.2564559216384683, + "grad_norm": 0.4639733135700226, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7687772363424301, + "num_tokens": 1014763701.0, + 
"step": 3528 + }, + { + "epoch": 1.256812110418522, + "grad_norm": 0.5152373909950256, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7677876204252243, + "num_tokens": 1015054351.0, + "step": 3529 + }, + { + "epoch": 1.2571682991985753, + "grad_norm": 0.4691147804260254, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7675579935312271, + "num_tokens": 1015347336.0, + "step": 3530 + }, + { + "epoch": 1.2575244879786287, + "grad_norm": 0.5095771551132202, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7627057731151581, + "num_tokens": 1015614687.0, + "step": 3531 + }, + { + "epoch": 1.257880676758682, + "grad_norm": 0.4928966164588928, + "learning_rate": 1e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.787971705198288, + "num_tokens": 1015856069.0, + "step": 3532 + }, + { + "epoch": 1.2582368655387355, + "grad_norm": 0.4738236367702484, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.775302529335022, + "num_tokens": 1016142610.0, + "step": 3533 + }, + { + "epoch": 1.2585930543187889, + "grad_norm": 0.45770829916000366, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7747640758752823, + "num_tokens": 1016446987.0, + "step": 3534 + }, + { + "epoch": 1.2589492430988423, + "grad_norm": 0.5124218463897705, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7550655454397202, + "num_tokens": 1016707757.0, + "step": 3535 + }, + { + "epoch": 1.2593054318788959, + "grad_norm": 0.4728641211986542, + "learning_rate": 1e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7737956941127777, + "num_tokens": 1017007103.0, + "step": 3536 + }, + { + "epoch": 1.2596616206589493, + "grad_norm": 0.4283669888973236, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7697613537311554, + "num_tokens": 1017322998.0, + "step": 3537 + }, + { + "epoch": 1.2600178094390027, + "grad_norm": 0.5235180854797363, + "learning_rate": 1e-06, + 
"loss": 0.7531, + "mean_token_accuracy": 0.7592007517814636, + "num_tokens": 1017599310.0, + "step": 3538 + }, + { + "epoch": 1.260373998219056, + "grad_norm": 0.4647628664970398, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.762839287519455, + "num_tokens": 1017892873.0, + "step": 3539 + }, + { + "epoch": 1.2607301869991097, + "grad_norm": 0.4848429262638092, + "learning_rate": 1e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7831787467002869, + "num_tokens": 1018177484.0, + "step": 3540 + }, + { + "epoch": 1.261086375779163, + "grad_norm": 0.4681119918823242, + "learning_rate": 1e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.7758643329143524, + "num_tokens": 1018473667.0, + "step": 3541 + }, + { + "epoch": 1.2614425645592164, + "grad_norm": 0.5091527104377747, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7745865881443024, + "num_tokens": 1018744533.0, + "step": 3542 + }, + { + "epoch": 1.2617987533392698, + "grad_norm": 0.4429212808609009, + "learning_rate": 1e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.7709570378065109, + "num_tokens": 1019035164.0, + "step": 3543 + }, + { + "epoch": 1.2621549421193232, + "grad_norm": 0.5098490118980408, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.772436648607254, + "num_tokens": 1019330552.0, + "step": 3544 + }, + { + "epoch": 1.2625111308993766, + "grad_norm": 0.4760649502277374, + "learning_rate": 1e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7754227966070175, + "num_tokens": 1019603672.0, + "step": 3545 + }, + { + "epoch": 1.26286731967943, + "grad_norm": 0.4588225483894348, + "learning_rate": 1e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7738631963729858, + "num_tokens": 1019898920.0, + "step": 3546 + }, + { + "epoch": 1.2632235084594834, + "grad_norm": 0.5013775825500488, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7627276033163071, + "num_tokens": 1020193658.0, + "step": 3547 + }, + { + "epoch": 
1.263579697239537, + "grad_norm": 0.45673274993896484, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7577039301395416, + "num_tokens": 1020481192.0, + "step": 3548 + }, + { + "epoch": 1.2639358860195904, + "grad_norm": 0.4939526319503784, + "learning_rate": 1e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7724394053220749, + "num_tokens": 1020797975.0, + "step": 3549 + }, + { + "epoch": 1.2642920747996438, + "grad_norm": 0.48097139596939087, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7443201541900635, + "num_tokens": 1021095783.0, + "step": 3550 + }, + { + "epoch": 1.2646482635796972, + "grad_norm": 0.43108731508255005, + "learning_rate": 1e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7669866681098938, + "num_tokens": 1021391784.0, + "step": 3551 + }, + { + "epoch": 1.2650044523597508, + "grad_norm": 0.443913996219635, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7670021802186966, + "num_tokens": 1021709466.0, + "step": 3552 + }, + { + "epoch": 1.2653606411398042, + "grad_norm": 0.4773072600364685, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7663342356681824, + "num_tokens": 1021987818.0, + "step": 3553 + }, + { + "epoch": 1.2657168299198576, + "grad_norm": 0.47648119926452637, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.769950345158577, + "num_tokens": 1022291174.0, + "step": 3554 + }, + { + "epoch": 1.266073018699911, + "grad_norm": 0.49307718873023987, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7596073150634766, + "num_tokens": 1022571810.0, + "step": 3555 + }, + { + "epoch": 1.2664292074799643, + "grad_norm": 0.5062969923019409, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7584021389484406, + "num_tokens": 1022848385.0, + "step": 3556 + }, + { + "epoch": 1.2667853962600177, + "grad_norm": 0.5453900098800659, + "learning_rate": 1e-06, + "loss": 0.7941, + 
"mean_token_accuracy": 0.748366966843605, + "num_tokens": 1023090557.0, + "step": 3557 + }, + { + "epoch": 1.2671415850400711, + "grad_norm": 0.47272545099258423, + "learning_rate": 1e-06, + "loss": 0.652, + "mean_token_accuracy": 0.7887321561574936, + "num_tokens": 1023334591.0, + "step": 3558 + }, + { + "epoch": 1.2674977738201247, + "grad_norm": 0.4694611430168152, + "learning_rate": 1e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7762597054243088, + "num_tokens": 1023602010.0, + "step": 3559 + }, + { + "epoch": 1.2678539626001781, + "grad_norm": 0.48814910650253296, + "learning_rate": 1e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7701488584280014, + "num_tokens": 1023874562.0, + "step": 3560 + }, + { + "epoch": 1.2682101513802315, + "grad_norm": 0.45912817120552063, + "learning_rate": 1e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7685619294643402, + "num_tokens": 1024154493.0, + "step": 3561 + }, + { + "epoch": 1.268566340160285, + "grad_norm": 0.5021430253982544, + "learning_rate": 1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7721447497606277, + "num_tokens": 1024447851.0, + "step": 3562 + }, + { + "epoch": 1.2689225289403383, + "grad_norm": 0.5110353231430054, + "learning_rate": 1e-06, + "loss": 0.6671, + "mean_token_accuracy": 0.7903320342302322, + "num_tokens": 1024731826.0, + "step": 3563 + }, + { + "epoch": 1.269278717720392, + "grad_norm": 0.539222240447998, + "learning_rate": 1e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7644211798906326, + "num_tokens": 1024989972.0, + "step": 3564 + }, + { + "epoch": 1.2696349065004453, + "grad_norm": 0.46698224544525146, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7545800507068634, + "num_tokens": 1025296774.0, + "step": 3565 + }, + { + "epoch": 1.2699910952804987, + "grad_norm": 0.4877336621284485, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7640074640512466, + "num_tokens": 1025594638.0, + "step": 3566 + }, + { + "epoch": 
1.270347284060552, + "grad_norm": 0.5155909657478333, + "learning_rate": 1e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7408341616392136, + "num_tokens": 1025860867.0, + "step": 3567 + }, + { + "epoch": 1.2707034728406055, + "grad_norm": 0.4887184798717499, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7416652888059616, + "num_tokens": 1026155462.0, + "step": 3568 + }, + { + "epoch": 1.2710596616206589, + "grad_norm": 0.4523795247077942, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7543814331293106, + "num_tokens": 1026451951.0, + "step": 3569 + }, + { + "epoch": 1.2714158504007123, + "grad_norm": 0.47653305530548096, + "learning_rate": 1e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7641021758317947, + "num_tokens": 1026775535.0, + "step": 3570 + }, + { + "epoch": 1.2717720391807659, + "grad_norm": 0.43926119804382324, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7566076070070267, + "num_tokens": 1027080957.0, + "step": 3571 + }, + { + "epoch": 1.2721282279608193, + "grad_norm": 0.4594828188419342, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7627887278795242, + "num_tokens": 1027376059.0, + "step": 3572 + }, + { + "epoch": 1.2724844167408726, + "grad_norm": 0.5242373943328857, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7691989988088608, + "num_tokens": 1027630174.0, + "step": 3573 + }, + { + "epoch": 1.272840605520926, + "grad_norm": 0.48686254024505615, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.766165241599083, + "num_tokens": 1027934493.0, + "step": 3574 + }, + { + "epoch": 1.2731967943009797, + "grad_norm": 0.5052089691162109, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7393674254417419, + "num_tokens": 1028216606.0, + "step": 3575 + }, + { + "epoch": 1.273552983081033, + "grad_norm": 0.4979570806026459, + "learning_rate": 1e-06, + "loss": 0.7539, + 
"mean_token_accuracy": 0.7647210508584976, + "num_tokens": 1028512232.0, + "step": 3576 + }, + { + "epoch": 1.2739091718610864, + "grad_norm": 0.49784600734710693, + "learning_rate": 1e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7474837154150009, + "num_tokens": 1028783917.0, + "step": 3577 + }, + { + "epoch": 1.2742653606411398, + "grad_norm": 0.4895167648792267, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7653430849313736, + "num_tokens": 1029067730.0, + "step": 3578 + }, + { + "epoch": 1.2746215494211932, + "grad_norm": 0.4265000820159912, + "learning_rate": 1e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7742815911769867, + "num_tokens": 1029361602.0, + "step": 3579 + }, + { + "epoch": 1.2749777382012466, + "grad_norm": 0.4723930060863495, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7522703111171722, + "num_tokens": 1029640981.0, + "step": 3580 + }, + { + "epoch": 1.2753339269813, + "grad_norm": 0.4777098596096039, + "learning_rate": 1e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7798760384321213, + "num_tokens": 1029931831.0, + "step": 3581 + }, + { + "epoch": 1.2756901157613534, + "grad_norm": 0.49733200669288635, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7562752962112427, + "num_tokens": 1030210438.0, + "step": 3582 + }, + { + "epoch": 1.276046304541407, + "grad_norm": 0.4680071771144867, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7501515746116638, + "num_tokens": 1030503785.0, + "step": 3583 + }, + { + "epoch": 1.2764024933214604, + "grad_norm": 0.42977768182754517, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7585065960884094, + "num_tokens": 1030828824.0, + "step": 3584 + }, + { + "epoch": 1.2767586821015138, + "grad_norm": 0.47027039527893066, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7743438631296158, + "num_tokens": 1031122934.0, + "step": 3585 + }, + { + "epoch": 
1.2771148708815672, + "grad_norm": 0.473406046628952, + "learning_rate": 1e-06, + "loss": 0.6829, + "mean_token_accuracy": 0.7802401930093765, + "num_tokens": 1031402184.0, + "step": 3586 + }, + { + "epoch": 1.2774710596616208, + "grad_norm": 0.481256365776062, + "learning_rate": 1e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7571742683649063, + "num_tokens": 1031658848.0, + "step": 3587 + }, + { + "epoch": 1.2778272484416742, + "grad_norm": 0.4317144751548767, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7666198760271072, + "num_tokens": 1031977144.0, + "step": 3588 + }, + { + "epoch": 1.2781834372217276, + "grad_norm": 0.4854028820991516, + "learning_rate": 1e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7614231407642365, + "num_tokens": 1032238335.0, + "step": 3589 + }, + { + "epoch": 1.278539626001781, + "grad_norm": 0.4655570387840271, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7531095743179321, + "num_tokens": 1032483524.0, + "step": 3590 + }, + { + "epoch": 1.2788958147818343, + "grad_norm": 0.45077061653137207, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7619896829128265, + "num_tokens": 1032776357.0, + "step": 3591 + }, + { + "epoch": 1.2792520035618877, + "grad_norm": 0.49902400374412537, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7560277283191681, + "num_tokens": 1033026257.0, + "step": 3592 + }, + { + "epoch": 1.2796081923419411, + "grad_norm": 0.4690985083580017, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7756514549255371, + "num_tokens": 1033327993.0, + "step": 3593 + }, + { + "epoch": 1.2799643811219947, + "grad_norm": 0.5245437026023865, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7644493579864502, + "num_tokens": 1033621862.0, + "step": 3594 + }, + { + "epoch": 1.2803205699020481, + "grad_norm": 0.4120674729347229, + "learning_rate": 1e-06, + "loss": 0.7089, + 
"mean_token_accuracy": 0.7739885300397873, + "num_tokens": 1033938069.0, + "step": 3595 + }, + { + "epoch": 1.2806767586821015, + "grad_norm": 0.46101218461990356, + "learning_rate": 1e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7802729904651642, + "num_tokens": 1034207961.0, + "step": 3596 + }, + { + "epoch": 1.281032947462155, + "grad_norm": 0.4702095091342926, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7606924772262573, + "num_tokens": 1034490560.0, + "step": 3597 + }, + { + "epoch": 1.2813891362422083, + "grad_norm": 0.4664632976055145, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7582918405532837, + "num_tokens": 1034774463.0, + "step": 3598 + }, + { + "epoch": 1.281745325022262, + "grad_norm": 0.4681837260723114, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7675877958536148, + "num_tokens": 1035055788.0, + "step": 3599 + }, + { + "epoch": 1.2821015138023153, + "grad_norm": 0.44706201553344727, + "learning_rate": 1e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7638308107852936, + "num_tokens": 1035377661.0, + "step": 3600 + }, + { + "epoch": 1.2824577025823687, + "grad_norm": 0.4796697497367859, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7635886520147324, + "num_tokens": 1035652061.0, + "step": 3601 + }, + { + "epoch": 1.282813891362422, + "grad_norm": 0.49038517475128174, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.752619206905365, + "num_tokens": 1035933871.0, + "step": 3602 + }, + { + "epoch": 1.2831700801424755, + "grad_norm": 0.454465389251709, + "learning_rate": 1e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7713741511106491, + "num_tokens": 1036232205.0, + "step": 3603 + }, + { + "epoch": 1.2835262689225289, + "grad_norm": 0.49663087725639343, + "learning_rate": 1e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7721648514270782, + "num_tokens": 1036503095.0, + "step": 3604 + }, + { + "epoch": 
1.2838824577025822, + "grad_norm": 0.5395470261573792, + "learning_rate": 1e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7494030892848969, + "num_tokens": 1036755368.0, + "step": 3605 + }, + { + "epoch": 1.2842386464826359, + "grad_norm": 0.5092273354530334, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7602948248386383, + "num_tokens": 1037016517.0, + "step": 3606 + }, + { + "epoch": 1.2845948352626892, + "grad_norm": 0.4617347717285156, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7535844594240189, + "num_tokens": 1037340344.0, + "step": 3607 + }, + { + "epoch": 1.2849510240427426, + "grad_norm": 0.48978281021118164, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7674710154533386, + "num_tokens": 1037626784.0, + "step": 3608 + }, + { + "epoch": 1.285307212822796, + "grad_norm": 0.5012942552566528, + "learning_rate": 1e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.752042680978775, + "num_tokens": 1037914451.0, + "step": 3609 + }, + { + "epoch": 1.2856634016028496, + "grad_norm": 0.49572983384132385, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.757570430636406, + "num_tokens": 1038201661.0, + "step": 3610 + }, + { + "epoch": 1.286019590382903, + "grad_norm": 0.45315638184547424, + "learning_rate": 1e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7678591459989548, + "num_tokens": 1038502849.0, + "step": 3611 + }, + { + "epoch": 1.2863757791629564, + "grad_norm": 0.4299738109111786, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.768507570028305, + "num_tokens": 1038816220.0, + "step": 3612 + }, + { + "epoch": 1.2867319679430098, + "grad_norm": 0.44953107833862305, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7629982680082321, + "num_tokens": 1039123483.0, + "step": 3613 + }, + { + "epoch": 1.2870881567230632, + "grad_norm": 0.4348424971103668, + "learning_rate": 1e-06, + "loss": 0.7028, + 
"mean_token_accuracy": 0.7692027240991592, + "num_tokens": 1039409535.0, + "step": 3614 + }, + { + "epoch": 1.2874443455031166, + "grad_norm": 0.4631558656692505, + "learning_rate": 1e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.7837674915790558, + "num_tokens": 1039699630.0, + "step": 3615 + }, + { + "epoch": 1.28780053428317, + "grad_norm": 0.44234511256217957, + "learning_rate": 1e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.772998720407486, + "num_tokens": 1039993143.0, + "step": 3616 + }, + { + "epoch": 1.2881567230632234, + "grad_norm": 0.45917612314224243, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7524103969335556, + "num_tokens": 1040288897.0, + "step": 3617 + }, + { + "epoch": 1.288512911843277, + "grad_norm": 0.47668829560279846, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7594865709543228, + "num_tokens": 1040565114.0, + "step": 3618 + }, + { + "epoch": 1.2888691006233304, + "grad_norm": 0.42857658863067627, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7629292607307434, + "num_tokens": 1040856234.0, + "step": 3619 + }, + { + "epoch": 1.2892252894033838, + "grad_norm": 0.5182804465293884, + "learning_rate": 1e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7743938565254211, + "num_tokens": 1041127617.0, + "step": 3620 + }, + { + "epoch": 1.2895814781834372, + "grad_norm": 0.5767751336097717, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7566635608673096, + "num_tokens": 1041372964.0, + "step": 3621 + }, + { + "epoch": 1.2899376669634908, + "grad_norm": 0.4120023548603058, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7720640301704407, + "num_tokens": 1041694639.0, + "step": 3622 + }, + { + "epoch": 1.2902938557435442, + "grad_norm": 0.5069987177848816, + "learning_rate": 1e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.7857145518064499, + "num_tokens": 1041957139.0, + "step": 3623 + }, + { + "epoch": 
1.2906500445235976, + "grad_norm": 0.4454066753387451, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.773329570889473, + "num_tokens": 1042275758.0, + "step": 3624 + }, + { + "epoch": 1.291006233303651, + "grad_norm": 0.45144230127334595, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7481771856546402, + "num_tokens": 1042568789.0, + "step": 3625 + }, + { + "epoch": 1.2913624220837043, + "grad_norm": 0.47756436467170715, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7624572366476059, + "num_tokens": 1042849845.0, + "step": 3626 + }, + { + "epoch": 1.2917186108637577, + "grad_norm": 0.494803249835968, + "learning_rate": 1e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7499635964632034, + "num_tokens": 1043109776.0, + "step": 3627 + }, + { + "epoch": 1.2920747996438111, + "grad_norm": 0.4946998655796051, + "learning_rate": 1e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7787090092897415, + "num_tokens": 1043382257.0, + "step": 3628 + }, + { + "epoch": 1.2924309884238647, + "grad_norm": 0.414792001247406, + "learning_rate": 1e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7735063582658768, + "num_tokens": 1043708584.0, + "step": 3629 + }, + { + "epoch": 1.2927871772039181, + "grad_norm": 0.4408905804157257, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7657153308391571, + "num_tokens": 1044035672.0, + "step": 3630 + }, + { + "epoch": 1.2931433659839715, + "grad_norm": 0.4795153737068176, + "learning_rate": 1e-06, + "loss": 0.738, + "mean_token_accuracy": 0.765904426574707, + "num_tokens": 1044310888.0, + "step": 3631 + }, + { + "epoch": 1.293499554764025, + "grad_norm": 0.5298268795013428, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7446250915527344, + "num_tokens": 1044571602.0, + "step": 3632 + }, + { + "epoch": 1.2938557435440783, + "grad_norm": 0.4850862920284271, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 
0.7486160546541214, + "num_tokens": 1044918236.0, + "step": 3633 + }, + { + "epoch": 1.294211932324132, + "grad_norm": 0.5270673036575317, + "learning_rate": 1e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7759179919958115, + "num_tokens": 1045194193.0, + "step": 3634 + }, + { + "epoch": 1.2945681211041853, + "grad_norm": 0.5079606771469116, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7578573077917099, + "num_tokens": 1045472405.0, + "step": 3635 + }, + { + "epoch": 1.2949243098842387, + "grad_norm": 0.4991551339626312, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.776159331202507, + "num_tokens": 1045753427.0, + "step": 3636 + }, + { + "epoch": 1.295280498664292, + "grad_norm": 0.4749472737312317, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7575354427099228, + "num_tokens": 1046064881.0, + "step": 3637 + }, + { + "epoch": 1.2956366874443455, + "grad_norm": 0.5045127868652344, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7543854266405106, + "num_tokens": 1046370130.0, + "step": 3638 + }, + { + "epoch": 1.2959928762243988, + "grad_norm": 0.5005964040756226, + "learning_rate": 1e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.793670728802681, + "num_tokens": 1046657552.0, + "step": 3639 + }, + { + "epoch": 1.2963490650044522, + "grad_norm": 0.46490204334259033, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7659112811088562, + "num_tokens": 1046964285.0, + "step": 3640 + }, + { + "epoch": 1.2967052537845059, + "grad_norm": 0.44400495290756226, + "learning_rate": 1e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7691611349582672, + "num_tokens": 1047256281.0, + "step": 3641 + }, + { + "epoch": 1.2970614425645592, + "grad_norm": 0.44453227519989014, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.75918348133564, + "num_tokens": 1047549434.0, + "step": 3642 + }, + { + "epoch": 1.2974176313446126, + "grad_norm": 
0.43459945917129517, + "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7576310336589813, + "num_tokens": 1047846163.0, + "step": 3643 + }, + { + "epoch": 1.297773820124666, + "grad_norm": 0.4659477472305298, + "learning_rate": 1e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.7782297879457474, + "num_tokens": 1048151220.0, + "step": 3644 + }, + { + "epoch": 1.2981300089047196, + "grad_norm": 0.4623878300189972, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7572034150362015, + "num_tokens": 1048427441.0, + "step": 3645 + }, + { + "epoch": 1.298486197684773, + "grad_norm": 0.4745858609676361, + "learning_rate": 1e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7711852937936783, + "num_tokens": 1048704897.0, + "step": 3646 + }, + { + "epoch": 1.2988423864648264, + "grad_norm": 0.4934593141078949, + "learning_rate": 1e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7817897498607635, + "num_tokens": 1048986394.0, + "step": 3647 + }, + { + "epoch": 1.2991985752448798, + "grad_norm": 0.5265190601348877, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7766574025154114, + "num_tokens": 1049256848.0, + "step": 3648 + }, + { + "epoch": 1.2995547640249332, + "grad_norm": 0.4521913528442383, + "learning_rate": 1e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7498764544725418, + "num_tokens": 1049567489.0, + "step": 3649 + }, + { + "epoch": 1.2999109528049866, + "grad_norm": 0.47937190532684326, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7618131637573242, + "num_tokens": 1049863073.0, + "step": 3650 + }, + { + "epoch": 1.30026714158504, + "grad_norm": 0.4992343783378601, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7701884806156158, + "num_tokens": 1050140664.0, + "step": 3651 + }, + { + "epoch": 1.3006233303650934, + "grad_norm": 0.5408013463020325, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.765448272228241, + 
"num_tokens": 1050427563.0, + "step": 3652 + }, + { + "epoch": 1.300979519145147, + "grad_norm": 0.4132370054721832, + "learning_rate": 1e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7508416920900345, + "num_tokens": 1050736026.0, + "step": 3653 + }, + { + "epoch": 1.3013357079252004, + "grad_norm": 0.46811896562576294, + "learning_rate": 1e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7768691629171371, + "num_tokens": 1051032603.0, + "step": 3654 + }, + { + "epoch": 1.3016918967052538, + "grad_norm": 0.44941988587379456, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7549004107713699, + "num_tokens": 1051341894.0, + "step": 3655 + }, + { + "epoch": 1.3020480854853071, + "grad_norm": 0.4924198389053345, + "learning_rate": 1e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.7932725995779037, + "num_tokens": 1051596496.0, + "step": 3656 + }, + { + "epoch": 1.3024042742653608, + "grad_norm": 0.5171330571174622, + "learning_rate": 1e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7636119276285172, + "num_tokens": 1051873713.0, + "step": 3657 + }, + { + "epoch": 1.3027604630454142, + "grad_norm": 0.48488277196884155, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7521077245473862, + "num_tokens": 1052119852.0, + "step": 3658 + }, + { + "epoch": 1.3031166518254675, + "grad_norm": 0.49804556369781494, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7374479472637177, + "num_tokens": 1052400009.0, + "step": 3659 + }, + { + "epoch": 1.303472840605521, + "grad_norm": 0.5026245713233948, + "learning_rate": 1e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7772878259420395, + "num_tokens": 1052702140.0, + "step": 3660 + }, + { + "epoch": 1.3038290293855743, + "grad_norm": 0.438184529542923, + "learning_rate": 1e-06, + "loss": 0.6487, + "mean_token_accuracy": 0.7906187027692795, + "num_tokens": 1052999639.0, + "step": 3661 + }, + { + "epoch": 1.3041852181656277, + "grad_norm": 0.4512130320072174, 
+ "learning_rate": 1e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7735421657562256, + "num_tokens": 1053307640.0, + "step": 3662 + }, + { + "epoch": 1.304541406945681, + "grad_norm": 0.4467358887195587, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7541415691375732, + "num_tokens": 1053621608.0, + "step": 3663 + }, + { + "epoch": 1.3048975957257347, + "grad_norm": 0.44955340027809143, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7641738057136536, + "num_tokens": 1053931520.0, + "step": 3664 + }, + { + "epoch": 1.305253784505788, + "grad_norm": 0.44325342774391174, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7422693073749542, + "num_tokens": 1054222579.0, + "step": 3665 + }, + { + "epoch": 1.3056099732858415, + "grad_norm": 0.48430439829826355, + "learning_rate": 1e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7719481438398361, + "num_tokens": 1054479661.0, + "step": 3666 + }, + { + "epoch": 1.3059661620658949, + "grad_norm": 0.4886106252670288, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7698508650064468, + "num_tokens": 1054763685.0, + "step": 3667 + }, + { + "epoch": 1.3063223508459483, + "grad_norm": 0.4505123496055603, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7635816335678101, + "num_tokens": 1055073206.0, + "step": 3668 + }, + { + "epoch": 1.306678539626002, + "grad_norm": 0.5230811238288879, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7423640042543411, + "num_tokens": 1055344774.0, + "step": 3669 + }, + { + "epoch": 1.3070347284060553, + "grad_norm": 0.4904353618621826, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7684341222047806, + "num_tokens": 1055604364.0, + "step": 3670 + }, + { + "epoch": 1.3073909171861087, + "grad_norm": 0.5099927186965942, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.736919954419136, + "num_tokens": 1055864858.0, + 
"step": 3671 + }, + { + "epoch": 1.307747105966162, + "grad_norm": 0.4471283257007599, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7736976593732834, + "num_tokens": 1056178257.0, + "step": 3672 + }, + { + "epoch": 1.3081032947462155, + "grad_norm": 0.4667149484157562, + "learning_rate": 1e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7709114998579025, + "num_tokens": 1056462905.0, + "step": 3673 + }, + { + "epoch": 1.3084594835262688, + "grad_norm": 0.4624282121658325, + "learning_rate": 1e-06, + "loss": 0.662, + "mean_token_accuracy": 0.7873749136924744, + "num_tokens": 1056746673.0, + "step": 3674 + }, + { + "epoch": 1.3088156723063222, + "grad_norm": 0.518742024898529, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7596093118190765, + "num_tokens": 1057014811.0, + "step": 3675 + }, + { + "epoch": 1.3091718610863758, + "grad_norm": 0.48975270986557007, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7537478804588318, + "num_tokens": 1057323038.0, + "step": 3676 + }, + { + "epoch": 1.3095280498664292, + "grad_norm": 0.5506721138954163, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7588405460119247, + "num_tokens": 1057560869.0, + "step": 3677 + }, + { + "epoch": 1.3098842386464826, + "grad_norm": 0.4849891662597656, + "learning_rate": 1e-06, + "loss": 0.6398, + "mean_token_accuracy": 0.7900753170251846, + "num_tokens": 1057836375.0, + "step": 3678 + }, + { + "epoch": 1.310240427426536, + "grad_norm": 0.4535049796104431, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7685125470161438, + "num_tokens": 1058132021.0, + "step": 3679 + }, + { + "epoch": 1.3105966162065896, + "grad_norm": 0.43840718269348145, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7617556750774384, + "num_tokens": 1058451180.0, + "step": 3680 + }, + { + "epoch": 1.310952804986643, + "grad_norm": 0.42564740777015686, + "learning_rate": 1e-06, + 
"loss": 0.77, + "mean_token_accuracy": 0.7542393803596497, + "num_tokens": 1058750149.0, + "step": 3681 + }, + { + "epoch": 1.3113089937666964, + "grad_norm": 0.44749629497528076, + "learning_rate": 1e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7735751122236252, + "num_tokens": 1059066902.0, + "step": 3682 + }, + { + "epoch": 1.3116651825467498, + "grad_norm": 0.514647364616394, + "learning_rate": 1e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7776380181312561, + "num_tokens": 1059327292.0, + "step": 3683 + }, + { + "epoch": 1.3120213713268032, + "grad_norm": 0.46687012910842896, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.75188347697258, + "num_tokens": 1059597215.0, + "step": 3684 + }, + { + "epoch": 1.3123775601068566, + "grad_norm": 0.4747070074081421, + "learning_rate": 1e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7519027888774872, + "num_tokens": 1059869681.0, + "step": 3685 + }, + { + "epoch": 1.31273374888691, + "grad_norm": 0.48576033115386963, + "learning_rate": 1e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7666871249675751, + "num_tokens": 1060145054.0, + "step": 3686 + }, + { + "epoch": 1.3130899376669634, + "grad_norm": 0.482905775308609, + "learning_rate": 1e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7528709918260574, + "num_tokens": 1060423062.0, + "step": 3687 + }, + { + "epoch": 1.313446126447017, + "grad_norm": 0.47933030128479004, + "learning_rate": 1e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7697466164827347, + "num_tokens": 1060704454.0, + "step": 3688 + }, + { + "epoch": 1.3138023152270704, + "grad_norm": 0.49530845880508423, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.758419468998909, + "num_tokens": 1061006340.0, + "step": 3689 + }, + { + "epoch": 1.3141585040071238, + "grad_norm": 0.5145691633224487, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7601046860218048, + "num_tokens": 1061269081.0, + "step": 3690 + }, + { + 
"epoch": 1.3145146927871771, + "grad_norm": 0.4916117787361145, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7611568868160248, + "num_tokens": 1061525817.0, + "step": 3691 + }, + { + "epoch": 1.3148708815672308, + "grad_norm": 0.47738924622535706, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7589985430240631, + "num_tokens": 1061798363.0, + "step": 3692 + }, + { + "epoch": 1.3152270703472841, + "grad_norm": 0.5020068287849426, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7704204320907593, + "num_tokens": 1062070796.0, + "step": 3693 + }, + { + "epoch": 1.3155832591273375, + "grad_norm": 0.43436959385871887, + "learning_rate": 1e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7605973184108734, + "num_tokens": 1062383426.0, + "step": 3694 + }, + { + "epoch": 1.315939447907391, + "grad_norm": 0.478376179933548, + "learning_rate": 1e-06, + "loss": 0.666, + "mean_token_accuracy": 0.7848488390445709, + "num_tokens": 1062657736.0, + "step": 3695 + }, + { + "epoch": 1.3162956366874443, + "grad_norm": 0.47168678045272827, + "learning_rate": 1e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7547516524791718, + "num_tokens": 1062944592.0, + "step": 3696 + }, + { + "epoch": 1.3166518254674977, + "grad_norm": 0.5182751417160034, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7582010179758072, + "num_tokens": 1063201578.0, + "step": 3697 + }, + { + "epoch": 1.317008014247551, + "grad_norm": 0.49624550342559814, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7457247525453568, + "num_tokens": 1063473733.0, + "step": 3698 + }, + { + "epoch": 1.3173642030276047, + "grad_norm": 0.5022448897361755, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7704313844442368, + "num_tokens": 1063762315.0, + "step": 3699 + }, + { + "epoch": 1.317720391807658, + "grad_norm": 0.479889839887619, + "learning_rate": 1e-06, + "loss": 0.6971, + 
"mean_token_accuracy": 0.7699317634105682, + "num_tokens": 1064035392.0, + "step": 3700 + }, + { + "epoch": 1.3180765805877115, + "grad_norm": 0.4782141447067261, + "learning_rate": 1e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7553329765796661, + "num_tokens": 1064306775.0, + "step": 3701 + }, + { + "epoch": 1.3184327693677649, + "grad_norm": 0.48106515407562256, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7730757147073746, + "num_tokens": 1064604808.0, + "step": 3702 + }, + { + "epoch": 1.3187889581478183, + "grad_norm": 0.516472578048706, + "learning_rate": 1e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7765943259000778, + "num_tokens": 1064888363.0, + "step": 3703 + }, + { + "epoch": 1.3191451469278719, + "grad_norm": 0.49194884300231934, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7606920748949051, + "num_tokens": 1065165103.0, + "step": 3704 + }, + { + "epoch": 1.3195013357079253, + "grad_norm": 0.4507330358028412, + "learning_rate": 1e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7765598893165588, + "num_tokens": 1065475301.0, + "step": 3705 + }, + { + "epoch": 1.3198575244879787, + "grad_norm": 0.4678279161453247, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7500783503055573, + "num_tokens": 1065760349.0, + "step": 3706 + }, + { + "epoch": 1.320213713268032, + "grad_norm": 0.4754440188407898, + "learning_rate": 1e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7635183781385422, + "num_tokens": 1066032073.0, + "step": 3707 + }, + { + "epoch": 1.3205699020480854, + "grad_norm": 0.4939599335193634, + "learning_rate": 1e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7630105465650558, + "num_tokens": 1066291603.0, + "step": 3708 + }, + { + "epoch": 1.3209260908281388, + "grad_norm": 0.5283200740814209, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7468345612287521, + "num_tokens": 1066584403.0, + "step": 3709 + }, + { + "epoch": 
1.3212822796081922, + "grad_norm": 0.4492529332637787, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7623673975467682, + "num_tokens": 1066852163.0, + "step": 3710 + }, + { + "epoch": 1.3216384683882458, + "grad_norm": 0.5161863565444946, + "learning_rate": 1e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7594075351953506, + "num_tokens": 1067114334.0, + "step": 3711 + }, + { + "epoch": 1.3219946571682992, + "grad_norm": 0.45840775966644287, + "learning_rate": 1e-06, + "loss": 0.6703, + "mean_token_accuracy": 0.7830507755279541, + "num_tokens": 1067402875.0, + "step": 3712 + }, + { + "epoch": 1.3223508459483526, + "grad_norm": 0.5358456373214722, + "learning_rate": 1e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7588595300912857, + "num_tokens": 1067657890.0, + "step": 3713 + }, + { + "epoch": 1.322707034728406, + "grad_norm": 0.4695528745651245, + "learning_rate": 1e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7863702028989792, + "num_tokens": 1067937614.0, + "step": 3714 + }, + { + "epoch": 1.3230632235084596, + "grad_norm": 0.4806866943836212, + "learning_rate": 1e-06, + "loss": 0.67, + "mean_token_accuracy": 0.7850033789873123, + "num_tokens": 1068237077.0, + "step": 3715 + }, + { + "epoch": 1.323419412288513, + "grad_norm": 0.48368188738822937, + "learning_rate": 1e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7597058713436127, + "num_tokens": 1068543392.0, + "step": 3716 + }, + { + "epoch": 1.3237756010685664, + "grad_norm": 0.49792855978012085, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7530209124088287, + "num_tokens": 1068846339.0, + "step": 3717 + }, + { + "epoch": 1.3241317898486198, + "grad_norm": 0.49009615182876587, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7635112255811691, + "num_tokens": 1069108114.0, + "step": 3718 + }, + { + "epoch": 1.3244879786286732, + "grad_norm": 0.5136198997497559, + "learning_rate": 1e-06, + "loss": 0.7898, + 
"mean_token_accuracy": 0.7498510926961899, + "num_tokens": 1069385713.0, + "step": 3719 + }, + { + "epoch": 1.3248441674087266, + "grad_norm": 0.5191143751144409, + "learning_rate": 1e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7625640034675598, + "num_tokens": 1069647610.0, + "step": 3720 + }, + { + "epoch": 1.32520035618878, + "grad_norm": 0.4848462641239166, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7520173043012619, + "num_tokens": 1069934900.0, + "step": 3721 + }, + { + "epoch": 1.3255565449688334, + "grad_norm": 0.4977709650993347, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7461830675601959, + "num_tokens": 1070218503.0, + "step": 3722 + }, + { + "epoch": 1.325912733748887, + "grad_norm": 0.48739972710609436, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7658344209194183, + "num_tokens": 1070494283.0, + "step": 3723 + }, + { + "epoch": 1.3262689225289404, + "grad_norm": 0.46904507279396057, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7524495124816895, + "num_tokens": 1070791704.0, + "step": 3724 + }, + { + "epoch": 1.3266251113089937, + "grad_norm": 0.48268231749534607, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7488872408866882, + "num_tokens": 1071108846.0, + "step": 3725 + }, + { + "epoch": 1.3269813000890471, + "grad_norm": 0.5079935193061829, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7674991190433502, + "num_tokens": 1071405515.0, + "step": 3726 + }, + { + "epoch": 1.3273374888691007, + "grad_norm": 0.45991775393486023, + "learning_rate": 1e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7693669646978378, + "num_tokens": 1071684508.0, + "step": 3727 + }, + { + "epoch": 1.3276936776491541, + "grad_norm": 0.4439684748649597, + "learning_rate": 1e-06, + "loss": 0.6211, + "mean_token_accuracy": 0.8007578551769257, + "num_tokens": 1071998369.0, + "step": 3728 + }, + { + "epoch": 
1.3280498664292075, + "grad_norm": 0.4863726794719696, + "learning_rate": 1e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7742959260940552, + "num_tokens": 1072303218.0, + "step": 3729 + }, + { + "epoch": 1.328406055209261, + "grad_norm": 0.44777658581733704, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7645092308521271, + "num_tokens": 1072581387.0, + "step": 3730 + }, + { + "epoch": 1.3287622439893143, + "grad_norm": 0.486291766166687, + "learning_rate": 1e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7712362408638, + "num_tokens": 1072869438.0, + "step": 3731 + }, + { + "epoch": 1.3291184327693677, + "grad_norm": 0.4710068702697754, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7604732811450958, + "num_tokens": 1073188292.0, + "step": 3732 + }, + { + "epoch": 1.329474621549421, + "grad_norm": 0.4880046248435974, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7422355562448502, + "num_tokens": 1073455021.0, + "step": 3733 + }, + { + "epoch": 1.3298308103294747, + "grad_norm": 0.5045356154441833, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7529123723506927, + "num_tokens": 1073755129.0, + "step": 3734 + }, + { + "epoch": 1.330186999109528, + "grad_norm": 0.5180131793022156, + "learning_rate": 1e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7647038996219635, + "num_tokens": 1074065529.0, + "step": 3735 + }, + { + "epoch": 1.3305431878895815, + "grad_norm": 0.48748084902763367, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7721623033285141, + "num_tokens": 1074363441.0, + "step": 3736 + }, + { + "epoch": 1.3308993766696349, + "grad_norm": 0.47577881813049316, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7795725762844086, + "num_tokens": 1074627887.0, + "step": 3737 + }, + { + "epoch": 1.3312555654496883, + "grad_norm": 0.5634731650352478, + "learning_rate": 1e-06, + "loss": 0.7135, + "mean_token_accuracy": 
0.7752998918294907, + "num_tokens": 1074882855.0, + "step": 3738 + }, + { + "epoch": 1.3316117542297419, + "grad_norm": 0.49300357699394226, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7664354890584946, + "num_tokens": 1075186046.0, + "step": 3739 + }, + { + "epoch": 1.3319679430097953, + "grad_norm": 0.4963620603084564, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7514042407274246, + "num_tokens": 1075447710.0, + "step": 3740 + }, + { + "epoch": 1.3323241317898487, + "grad_norm": 0.5017169713973999, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7704362273216248, + "num_tokens": 1075716111.0, + "step": 3741 + }, + { + "epoch": 1.332680320569902, + "grad_norm": 0.5060286521911621, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7577271461486816, + "num_tokens": 1075994541.0, + "step": 3742 + }, + { + "epoch": 1.3330365093499554, + "grad_norm": 0.5221025347709656, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7694423198699951, + "num_tokens": 1076265995.0, + "step": 3743 + }, + { + "epoch": 1.3333926981300088, + "grad_norm": 0.4898301959037781, + "learning_rate": 1e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7800524234771729, + "num_tokens": 1076507788.0, + "step": 3744 + }, + { + "epoch": 1.3337488869100622, + "grad_norm": 0.46255454421043396, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7658863812685013, + "num_tokens": 1076807082.0, + "step": 3745 + }, + { + "epoch": 1.3341050756901158, + "grad_norm": 0.48444369435310364, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7617538273334503, + "num_tokens": 1077074696.0, + "step": 3746 + }, + { + "epoch": 1.3344612644701692, + "grad_norm": 0.5322958827018738, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7632095515727997, + "num_tokens": 1077346115.0, + "step": 3747 + }, + { + "epoch": 1.3348174532502226, + "grad_norm": 
0.4627836048603058, + "learning_rate": 1e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7665006071329117, + "num_tokens": 1077625879.0, + "step": 3748 + }, + { + "epoch": 1.335173642030276, + "grad_norm": 0.4664059281349182, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7632392197847366, + "num_tokens": 1077920421.0, + "step": 3749 + }, + { + "epoch": 1.3355298308103296, + "grad_norm": 0.50117427110672, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7562578320503235, + "num_tokens": 1078203509.0, + "step": 3750 + }, + { + "epoch": 1.335886019590383, + "grad_norm": 0.4878578186035156, + "learning_rate": 1e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7726283073425293, + "num_tokens": 1078477413.0, + "step": 3751 + }, + { + "epoch": 1.3362422083704364, + "grad_norm": 0.48313403129577637, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7678572535514832, + "num_tokens": 1078754297.0, + "step": 3752 + }, + { + "epoch": 1.3365983971504898, + "grad_norm": 0.4613833725452423, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7773925364017487, + "num_tokens": 1079032174.0, + "step": 3753 + }, + { + "epoch": 1.3369545859305432, + "grad_norm": 0.5098069310188293, + "learning_rate": 1e-06, + "loss": 0.7631, + "mean_token_accuracy": 0.7571549564599991, + "num_tokens": 1079323206.0, + "step": 3754 + }, + { + "epoch": 1.3373107747105966, + "grad_norm": 0.44528600573539734, + "learning_rate": 1e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7667511701583862, + "num_tokens": 1079606321.0, + "step": 3755 + }, + { + "epoch": 1.33766696349065, + "grad_norm": 0.5236674547195435, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7577677816152573, + "num_tokens": 1079861019.0, + "step": 3756 + }, + { + "epoch": 1.3380231522707033, + "grad_norm": 0.4987267851829529, + "learning_rate": 1e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7705605179071426, + 
"num_tokens": 1080133449.0, + "step": 3757 + }, + { + "epoch": 1.338379341050757, + "grad_norm": 0.4558723270893097, + "learning_rate": 1e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7851635366678238, + "num_tokens": 1080427351.0, + "step": 3758 + }, + { + "epoch": 1.3387355298308103, + "grad_norm": 0.46380770206451416, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7650863826274872, + "num_tokens": 1080693910.0, + "step": 3759 + }, + { + "epoch": 1.3390917186108637, + "grad_norm": 0.4745628833770752, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7568188309669495, + "num_tokens": 1081024104.0, + "step": 3760 + }, + { + "epoch": 1.3394479073909171, + "grad_norm": 0.4783805310726166, + "learning_rate": 1e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.7859385460615158, + "num_tokens": 1081326201.0, + "step": 3761 + }, + { + "epoch": 1.3398040961709707, + "grad_norm": 0.4914674758911133, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7505426555871964, + "num_tokens": 1081594256.0, + "step": 3762 + }, + { + "epoch": 1.3401602849510241, + "grad_norm": 0.4764159321784973, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7546903342008591, + "num_tokens": 1081848315.0, + "step": 3763 + }, + { + "epoch": 1.3405164737310775, + "grad_norm": 0.4383394122123718, + "learning_rate": 1e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.7621623575687408, + "num_tokens": 1082155083.0, + "step": 3764 + }, + { + "epoch": 1.340872662511131, + "grad_norm": 0.5035179257392883, + "learning_rate": 1e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7768388390541077, + "num_tokens": 1082424079.0, + "step": 3765 + }, + { + "epoch": 1.3412288512911843, + "grad_norm": 0.4828214943408966, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7445286512374878, + "num_tokens": 1082735873.0, + "step": 3766 + }, + { + "epoch": 1.3415850400712377, + "grad_norm": 0.4801713824272156, + 
"learning_rate": 1e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7678918540477753, + "num_tokens": 1083034108.0, + "step": 3767 + }, + { + "epoch": 1.341941228851291, + "grad_norm": 0.47577834129333496, + "learning_rate": 1e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7794003039598465, + "num_tokens": 1083317364.0, + "step": 3768 + }, + { + "epoch": 1.3422974176313447, + "grad_norm": 0.5252863168716431, + "learning_rate": 1e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7694414407014847, + "num_tokens": 1083608475.0, + "step": 3769 + }, + { + "epoch": 1.342653606411398, + "grad_norm": 0.4546639323234558, + "learning_rate": 1e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.7896629422903061, + "num_tokens": 1083923359.0, + "step": 3770 + }, + { + "epoch": 1.3430097951914515, + "grad_norm": 0.5212105512619019, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7720352411270142, + "num_tokens": 1084184565.0, + "step": 3771 + }, + { + "epoch": 1.3433659839715049, + "grad_norm": 0.38307857513427734, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7781974822282791, + "num_tokens": 1084508155.0, + "step": 3772 + }, + { + "epoch": 1.3437221727515583, + "grad_norm": 0.46038466691970825, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7567430436611176, + "num_tokens": 1084794149.0, + "step": 3773 + }, + { + "epoch": 1.3440783615316119, + "grad_norm": 0.4383854568004608, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7643671482801437, + "num_tokens": 1085108325.0, + "step": 3774 + }, + { + "epoch": 1.3444345503116653, + "grad_norm": 0.4564778208732605, + "learning_rate": 1e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7633071541786194, + "num_tokens": 1085401616.0, + "step": 3775 + }, + { + "epoch": 1.3447907390917186, + "grad_norm": 0.48813173174858093, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7532582581043243, + "num_tokens": 1085683049.0, + 
"step": 3776 + }, + { + "epoch": 1.345146927871772, + "grad_norm": 0.42754065990448, + "learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.768312007188797, + "num_tokens": 1086006310.0, + "step": 3777 + }, + { + "epoch": 1.3455031166518254, + "grad_norm": 0.48953428864479065, + "learning_rate": 1e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.760013997554779, + "num_tokens": 1086281014.0, + "step": 3778 + }, + { + "epoch": 1.3458593054318788, + "grad_norm": 0.4592215120792389, + "learning_rate": 1e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7783167809247971, + "num_tokens": 1086587628.0, + "step": 3779 + }, + { + "epoch": 1.3462154942119322, + "grad_norm": 0.48536303639411926, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7691406905651093, + "num_tokens": 1086865395.0, + "step": 3780 + }, + { + "epoch": 1.3465716829919858, + "grad_norm": 0.4557923972606659, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7623979151248932, + "num_tokens": 1087143258.0, + "step": 3781 + }, + { + "epoch": 1.3469278717720392, + "grad_norm": 0.470993310213089, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.763847753405571, + "num_tokens": 1087391509.0, + "step": 3782 + }, + { + "epoch": 1.3472840605520926, + "grad_norm": 0.47885042428970337, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7406162172555923, + "num_tokens": 1087675976.0, + "step": 3783 + }, + { + "epoch": 1.347640249332146, + "grad_norm": 0.4884292185306549, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7607478052377701, + "num_tokens": 1087977835.0, + "step": 3784 + }, + { + "epoch": 1.3479964381121996, + "grad_norm": 0.46368786692619324, + "learning_rate": 1e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7845394611358643, + "num_tokens": 1088269771.0, + "step": 3785 + }, + { + "epoch": 1.348352626892253, + "grad_norm": 0.48919862508773804, + "learning_rate": 1e-06, + "loss": 
0.7554, + "mean_token_accuracy": 0.758544385433197, + "num_tokens": 1088533322.0, + "step": 3786 + }, + { + "epoch": 1.3487088156723064, + "grad_norm": 0.4559798538684845, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7718836814165115, + "num_tokens": 1088812377.0, + "step": 3787 + }, + { + "epoch": 1.3490650044523598, + "grad_norm": 0.4653835594654083, + "learning_rate": 1e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7692523002624512, + "num_tokens": 1089124751.0, + "step": 3788 + }, + { + "epoch": 1.3494211932324132, + "grad_norm": 0.5048410296440125, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7601106762886047, + "num_tokens": 1089410010.0, + "step": 3789 + }, + { + "epoch": 1.3497773820124666, + "grad_norm": 0.47482430934906006, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.766302078962326, + "num_tokens": 1089674384.0, + "step": 3790 + }, + { + "epoch": 1.35013357079252, + "grad_norm": 0.4729434549808502, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7459762990474701, + "num_tokens": 1089961925.0, + "step": 3791 + }, + { + "epoch": 1.3504897595725733, + "grad_norm": 0.47905921936035156, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7704710066318512, + "num_tokens": 1090235826.0, + "step": 3792 + }, + { + "epoch": 1.350845948352627, + "grad_norm": 0.47376397252082825, + "learning_rate": 1e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7731785029172897, + "num_tokens": 1090496298.0, + "step": 3793 + }, + { + "epoch": 1.3512021371326803, + "grad_norm": 0.4650753438472748, + "learning_rate": 1e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.749660849571228, + "num_tokens": 1090811090.0, + "step": 3794 + }, + { + "epoch": 1.3515583259127337, + "grad_norm": 0.47953954339027405, + "learning_rate": 1e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7462325990200043, + "num_tokens": 1091075039.0, + "step": 3795 + }, + { + "epoch": 
1.3519145146927871, + "grad_norm": 0.4227738380432129, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7536284625530243, + "num_tokens": 1091392607.0, + "step": 3796 + }, + { + "epoch": 1.3522707034728407, + "grad_norm": 0.4402693510055542, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7731128036975861, + "num_tokens": 1091694879.0, + "step": 3797 + }, + { + "epoch": 1.3526268922528941, + "grad_norm": 0.47225138545036316, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.770956888794899, + "num_tokens": 1091975214.0, + "step": 3798 + }, + { + "epoch": 1.3529830810329475, + "grad_norm": 0.4691768288612366, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.769352525472641, + "num_tokens": 1092270609.0, + "step": 3799 + }, + { + "epoch": 1.353339269813001, + "grad_norm": 0.4472627639770508, + "learning_rate": 1e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.7679276168346405, + "num_tokens": 1092570368.0, + "step": 3800 + }, + { + "epoch": 1.3536954585930543, + "grad_norm": 0.457602322101593, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7641647905111313, + "num_tokens": 1092872878.0, + "step": 3801 + }, + { + "epoch": 1.3540516473731077, + "grad_norm": 0.5004162192344666, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.756629005074501, + "num_tokens": 1093157717.0, + "step": 3802 + }, + { + "epoch": 1.354407836153161, + "grad_norm": 0.43473830819129944, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7473073303699493, + "num_tokens": 1093466097.0, + "step": 3803 + }, + { + "epoch": 1.3547640249332147, + "grad_norm": 0.45391830801963806, + "learning_rate": 1e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.781113475561142, + "num_tokens": 1093746808.0, + "step": 3804 + }, + { + "epoch": 1.355120213713268, + "grad_norm": 0.4970814883708954, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 
0.7622795850038528, + "num_tokens": 1094025834.0, + "step": 3805 + }, + { + "epoch": 1.3554764024933215, + "grad_norm": 0.4869561195373535, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7746341377496719, + "num_tokens": 1094316116.0, + "step": 3806 + }, + { + "epoch": 1.3558325912733749, + "grad_norm": 0.45666736364364624, + "learning_rate": 1e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7696642875671387, + "num_tokens": 1094620503.0, + "step": 3807 + }, + { + "epoch": 1.3561887800534282, + "grad_norm": 0.43428871035575867, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7598038017749786, + "num_tokens": 1094915421.0, + "step": 3808 + }, + { + "epoch": 1.3565449688334819, + "grad_norm": 0.49193066358566284, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7590255737304688, + "num_tokens": 1095186040.0, + "step": 3809 + }, + { + "epoch": 1.3569011576135352, + "grad_norm": 0.46424517035484314, + "learning_rate": 1e-06, + "loss": 0.651, + "mean_token_accuracy": 0.788477823138237, + "num_tokens": 1095471468.0, + "step": 3810 + }, + { + "epoch": 1.3572573463935886, + "grad_norm": 0.5147121548652649, + "learning_rate": 1e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7738770842552185, + "num_tokens": 1095737029.0, + "step": 3811 + }, + { + "epoch": 1.357613535173642, + "grad_norm": 0.45550212264060974, + "learning_rate": 1e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7652814239263535, + "num_tokens": 1096024275.0, + "step": 3812 + }, + { + "epoch": 1.3579697239536954, + "grad_norm": 0.46431615948677063, + "learning_rate": 1e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7662677466869354, + "num_tokens": 1096315173.0, + "step": 3813 + }, + { + "epoch": 1.3583259127337488, + "grad_norm": 0.4657754600048065, + "learning_rate": 1e-06, + "loss": 0.6575, + "mean_token_accuracy": 0.7841320186853409, + "num_tokens": 1096605518.0, + "step": 3814 + }, + { + "epoch": 1.3586821015138022, + 
"grad_norm": 0.4682979881763458, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.780966728925705, + "num_tokens": 1096880132.0, + "step": 3815 + }, + { + "epoch": 1.3590382902938558, + "grad_norm": 0.49287331104278564, + "learning_rate": 1e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.772055372595787, + "num_tokens": 1097167501.0, + "step": 3816 + }, + { + "epoch": 1.3593944790739092, + "grad_norm": 0.5041561722755432, + "learning_rate": 1e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7691480964422226, + "num_tokens": 1097439898.0, + "step": 3817 + }, + { + "epoch": 1.3597506678539626, + "grad_norm": 0.42915451526641846, + "learning_rate": 1e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7725946456193924, + "num_tokens": 1097735414.0, + "step": 3818 + }, + { + "epoch": 1.360106856634016, + "grad_norm": 0.43078380823135376, + "learning_rate": 1e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7775537818670273, + "num_tokens": 1098072181.0, + "step": 3819 + }, + { + "epoch": 1.3604630454140696, + "grad_norm": 0.5158278346061707, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7523743361234665, + "num_tokens": 1098312944.0, + "step": 3820 + }, + { + "epoch": 1.360819234194123, + "grad_norm": 0.49038630723953247, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7676911652088165, + "num_tokens": 1098594653.0, + "step": 3821 + }, + { + "epoch": 1.3611754229741764, + "grad_norm": 0.5343278646469116, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.770739421248436, + "num_tokens": 1098848839.0, + "step": 3822 + }, + { + "epoch": 1.3615316117542298, + "grad_norm": 0.4443668723106384, + "learning_rate": 1e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7737371027469635, + "num_tokens": 1099175272.0, + "step": 3823 + }, + { + "epoch": 1.3618878005342832, + "grad_norm": 0.47027134895324707, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 
0.7725679874420166, + "num_tokens": 1099467803.0, + "step": 3824 + }, + { + "epoch": 1.3622439893143365, + "grad_norm": 0.461127370595932, + "learning_rate": 1e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7663535177707672, + "num_tokens": 1099740086.0, + "step": 3825 + }, + { + "epoch": 1.36260017809439, + "grad_norm": 0.43915310502052307, + "learning_rate": 1e-06, + "loss": 0.6923, + "mean_token_accuracy": 0.7738001495599747, + "num_tokens": 1100052091.0, + "step": 3826 + }, + { + "epoch": 1.3629563668744433, + "grad_norm": 0.4655694365501404, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7622387111186981, + "num_tokens": 1100332344.0, + "step": 3827 + }, + { + "epoch": 1.363312555654497, + "grad_norm": 0.494800329208374, + "learning_rate": 1e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7607159316539764, + "num_tokens": 1100593751.0, + "step": 3828 + }, + { + "epoch": 1.3636687444345503, + "grad_norm": 0.4266279637813568, + "learning_rate": 1e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7641586661338806, + "num_tokens": 1100934014.0, + "step": 3829 + }, + { + "epoch": 1.3640249332146037, + "grad_norm": 0.4416760802268982, + "learning_rate": 1e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7790958434343338, + "num_tokens": 1101275400.0, + "step": 3830 + }, + { + "epoch": 1.364381121994657, + "grad_norm": 0.4327106475830078, + "learning_rate": 1e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7775000035762787, + "num_tokens": 1101603263.0, + "step": 3831 + }, + { + "epoch": 1.3647373107747107, + "grad_norm": 0.4775204658508301, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7564384192228317, + "num_tokens": 1101881463.0, + "step": 3832 + }, + { + "epoch": 1.3650934995547641, + "grad_norm": 0.4959547221660614, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7740951329469681, + "num_tokens": 1102181465.0, + "step": 3833 + }, + { + "epoch": 1.3654496883348175, + "grad_norm": 
0.4847791790962219, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7559253871440887, + "num_tokens": 1102459532.0, + "step": 3834 + }, + { + "epoch": 1.365805877114871, + "grad_norm": 0.5422922968864441, + "learning_rate": 1e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7625986784696579, + "num_tokens": 1102727248.0, + "step": 3835 + }, + { + "epoch": 1.3661620658949243, + "grad_norm": 0.5413948893547058, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7617383599281311, + "num_tokens": 1103003299.0, + "step": 3836 + }, + { + "epoch": 1.3665182546749777, + "grad_norm": 0.49242135882377625, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7559038698673248, + "num_tokens": 1103279746.0, + "step": 3837 + }, + { + "epoch": 1.366874443455031, + "grad_norm": 0.5062355995178223, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7539171874523163, + "num_tokens": 1103564166.0, + "step": 3838 + }, + { + "epoch": 1.3672306322350847, + "grad_norm": 0.46580395102500916, + "learning_rate": 1e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.7869965583086014, + "num_tokens": 1103853786.0, + "step": 3839 + }, + { + "epoch": 1.367586821015138, + "grad_norm": 0.4782000184059143, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7671618461608887, + "num_tokens": 1104174654.0, + "step": 3840 + }, + { + "epoch": 1.3679430097951915, + "grad_norm": 0.4953010380268097, + "learning_rate": 1e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7707157582044601, + "num_tokens": 1104451051.0, + "step": 3841 + }, + { + "epoch": 1.3682991985752448, + "grad_norm": 0.529767632484436, + "learning_rate": 1e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7446706295013428, + "num_tokens": 1104735439.0, + "step": 3842 + }, + { + "epoch": 1.3686553873552982, + "grad_norm": 0.529912531375885, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7746766954660416, + 
"num_tokens": 1105037884.0, + "step": 3843 + }, + { + "epoch": 1.3690115761353518, + "grad_norm": 0.475727379322052, + "learning_rate": 1e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7788875102996826, + "num_tokens": 1105324429.0, + "step": 3844 + }, + { + "epoch": 1.3693677649154052, + "grad_norm": 0.44270795583724976, + "learning_rate": 1e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7568662315607071, + "num_tokens": 1105586660.0, + "step": 3845 + }, + { + "epoch": 1.3697239536954586, + "grad_norm": 0.4179422855377197, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7667330056428909, + "num_tokens": 1105903833.0, + "step": 3846 + }, + { + "epoch": 1.370080142475512, + "grad_norm": 0.49094653129577637, + "learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7523146569728851, + "num_tokens": 1106210771.0, + "step": 3847 + }, + { + "epoch": 1.3704363312555654, + "grad_norm": 0.4570371210575104, + "learning_rate": 1e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7648274004459381, + "num_tokens": 1106527948.0, + "step": 3848 + }, + { + "epoch": 1.3707925200356188, + "grad_norm": 0.4735782742500305, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 0.763228639960289, + "num_tokens": 1106813084.0, + "step": 3849 + }, + { + "epoch": 1.3711487088156722, + "grad_norm": 0.49654653668403625, + "learning_rate": 1e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7731381356716156, + "num_tokens": 1107093133.0, + "step": 3850 + }, + { + "epoch": 1.3715048975957258, + "grad_norm": 0.4414391815662384, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7490849792957306, + "num_tokens": 1107396687.0, + "step": 3851 + }, + { + "epoch": 1.3718610863757792, + "grad_norm": 0.46868783235549927, + "learning_rate": 1e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7746726274490356, + "num_tokens": 1107691602.0, + "step": 3852 + }, + { + "epoch": 1.3722172751558326, + "grad_norm": 0.4310477077960968, + 
"learning_rate": 1e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7615796476602554, + "num_tokens": 1108045686.0, + "step": 3853 + }, + { + "epoch": 1.372573463935886, + "grad_norm": 0.4673134684562683, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7381841689348221, + "num_tokens": 1108333532.0, + "step": 3854 + }, + { + "epoch": 1.3729296527159396, + "grad_norm": 0.4737541377544403, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7709866315126419, + "num_tokens": 1108615714.0, + "step": 3855 + }, + { + "epoch": 1.373285841495993, + "grad_norm": 0.435831218957901, + "learning_rate": 1e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7786143720149994, + "num_tokens": 1108931474.0, + "step": 3856 + }, + { + "epoch": 1.3736420302760464, + "grad_norm": 0.49179235100746155, + "learning_rate": 1e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7614181786775589, + "num_tokens": 1109193190.0, + "step": 3857 + }, + { + "epoch": 1.3739982190560998, + "grad_norm": 0.45652133226394653, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7633533030748367, + "num_tokens": 1109500815.0, + "step": 3858 + }, + { + "epoch": 1.3743544078361531, + "grad_norm": 0.5397256016731262, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7677814662456512, + "num_tokens": 1109775682.0, + "step": 3859 + }, + { + "epoch": 1.3747105966162065, + "grad_norm": 0.5039874315261841, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7636829912662506, + "num_tokens": 1110065763.0, + "step": 3860 + }, + { + "epoch": 1.37506678539626, + "grad_norm": 0.45935946702957153, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7668467611074448, + "num_tokens": 1110364024.0, + "step": 3861 + }, + { + "epoch": 1.3754229741763133, + "grad_norm": 0.5297581553459167, + "learning_rate": 1e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.783725380897522, + "num_tokens": 1110616062.0, + 
"step": 3862 + }, + { + "epoch": 1.375779162956367, + "grad_norm": 0.4641052782535553, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7731194794178009, + "num_tokens": 1110894209.0, + "step": 3863 + }, + { + "epoch": 1.3761353517364203, + "grad_norm": 0.44283801317214966, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7685575485229492, + "num_tokens": 1111182490.0, + "step": 3864 + }, + { + "epoch": 1.3764915405164737, + "grad_norm": 0.4836299419403076, + "learning_rate": 1e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7570240348577499, + "num_tokens": 1111471710.0, + "step": 3865 + }, + { + "epoch": 1.376847729296527, + "grad_norm": 0.4502122402191162, + "learning_rate": 1e-06, + "loss": 0.6555, + "mean_token_accuracy": 0.7872241139411926, + "num_tokens": 1111768838.0, + "step": 3866 + }, + { + "epoch": 1.3772039180765807, + "grad_norm": 0.49575966596603394, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7746228873729706, + "num_tokens": 1112027715.0, + "step": 3867 + }, + { + "epoch": 1.377560106856634, + "grad_norm": 0.5120747685432434, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7617349028587341, + "num_tokens": 1112281994.0, + "step": 3868 + }, + { + "epoch": 1.3779162956366875, + "grad_norm": 0.48329025506973267, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7667637467384338, + "num_tokens": 1112578965.0, + "step": 3869 + }, + { + "epoch": 1.3782724844167409, + "grad_norm": 0.47897249460220337, + "learning_rate": 1e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.7835472226142883, + "num_tokens": 1112832921.0, + "step": 3870 + }, + { + "epoch": 1.3786286731967943, + "grad_norm": 0.4650370478630066, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7603986114263535, + "num_tokens": 1113141373.0, + "step": 3871 + }, + { + "epoch": 1.3789848619768477, + "grad_norm": 0.47562772035598755, + "learning_rate": 1e-06, + 
"loss": 0.7618, + "mean_token_accuracy": 0.758141040802002, + "num_tokens": 1113449403.0, + "step": 3872 + }, + { + "epoch": 1.379341050756901, + "grad_norm": 0.4750409424304962, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7589818686246872, + "num_tokens": 1113742710.0, + "step": 3873 + }, + { + "epoch": 1.3796972395369547, + "grad_norm": 0.4434987008571625, + "learning_rate": 1e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7791968882083893, + "num_tokens": 1114046075.0, + "step": 3874 + }, + { + "epoch": 1.380053428317008, + "grad_norm": 0.464254766702652, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7730532139539719, + "num_tokens": 1114330725.0, + "step": 3875 + }, + { + "epoch": 1.3804096170970614, + "grad_norm": 0.4970637559890747, + "learning_rate": 1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7718740701675415, + "num_tokens": 1114609914.0, + "step": 3876 + }, + { + "epoch": 1.3807658058771148, + "grad_norm": 0.48381978273391724, + "learning_rate": 1e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7645363658666611, + "num_tokens": 1114899887.0, + "step": 3877 + }, + { + "epoch": 1.3811219946571682, + "grad_norm": 0.4632703363895416, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7550147175788879, + "num_tokens": 1115191271.0, + "step": 3878 + }, + { + "epoch": 1.3814781834372218, + "grad_norm": 0.5016262531280518, + "learning_rate": 1e-06, + "loss": 0.6752, + "mean_token_accuracy": 0.7798347175121307, + "num_tokens": 1115458359.0, + "step": 3879 + }, + { + "epoch": 1.3818343722172752, + "grad_norm": 0.41614145040512085, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7679532915353775, + "num_tokens": 1115789406.0, + "step": 3880 + }, + { + "epoch": 1.3821905609973286, + "grad_norm": 0.4764646291732788, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7441438883543015, + "num_tokens": 1116117006.0, + "step": 3881 + }, + { + 
"epoch": 1.382546749777382, + "grad_norm": 0.4373632073402405, + "learning_rate": 1e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7769571989774704, + "num_tokens": 1116397765.0, + "step": 3882 + }, + { + "epoch": 1.3829029385574354, + "grad_norm": 0.5134674906730652, + "learning_rate": 1e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7530969530344009, + "num_tokens": 1116683844.0, + "step": 3883 + }, + { + "epoch": 1.3832591273374888, + "grad_norm": 0.4700120687484741, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7636884450912476, + "num_tokens": 1116968539.0, + "step": 3884 + }, + { + "epoch": 1.3836153161175422, + "grad_norm": 0.47763684391975403, + "learning_rate": 1e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7476778924465179, + "num_tokens": 1117253988.0, + "step": 3885 + }, + { + "epoch": 1.3839715048975958, + "grad_norm": 0.4513624310493469, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7678414434194565, + "num_tokens": 1117568909.0, + "step": 3886 + }, + { + "epoch": 1.3843276936776492, + "grad_norm": 0.4517326056957245, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7698521465063095, + "num_tokens": 1117888466.0, + "step": 3887 + }, + { + "epoch": 1.3846838824577026, + "grad_norm": 0.45662015676498413, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7578887939453125, + "num_tokens": 1118209135.0, + "step": 3888 + }, + { + "epoch": 1.385040071237756, + "grad_norm": 0.4775606095790863, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.762445330619812, + "num_tokens": 1118523891.0, + "step": 3889 + }, + { + "epoch": 1.3853962600178096, + "grad_norm": 0.4402921795845032, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7567353844642639, + "num_tokens": 1118843484.0, + "step": 3890 + }, + { + "epoch": 1.385752448797863, + "grad_norm": 0.5179701447486877, + "learning_rate": 1e-06, + "loss": 0.7655, + 
"mean_token_accuracy": 0.7567522674798965, + "num_tokens": 1119102195.0, + "step": 3891 + }, + { + "epoch": 1.3861086375779164, + "grad_norm": 0.420003741979599, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.764927327632904, + "num_tokens": 1119435109.0, + "step": 3892 + }, + { + "epoch": 1.3864648263579697, + "grad_norm": 0.43318939208984375, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7492047101259232, + "num_tokens": 1119774949.0, + "step": 3893 + }, + { + "epoch": 1.3868210151380231, + "grad_norm": 0.4959756135940552, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7631511241197586, + "num_tokens": 1120031700.0, + "step": 3894 + }, + { + "epoch": 1.3871772039180765, + "grad_norm": 0.5325773358345032, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7635435312986374, + "num_tokens": 1120291576.0, + "step": 3895 + }, + { + "epoch": 1.38753339269813, + "grad_norm": 0.523356020450592, + "learning_rate": 1e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.7765623927116394, + "num_tokens": 1120558951.0, + "step": 3896 + }, + { + "epoch": 1.3878895814781833, + "grad_norm": 0.5550752282142639, + "learning_rate": 1e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7668813616037369, + "num_tokens": 1120815611.0, + "step": 3897 + }, + { + "epoch": 1.388245770258237, + "grad_norm": 0.46620500087738037, + "learning_rate": 1e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7855800539255142, + "num_tokens": 1121133854.0, + "step": 3898 + }, + { + "epoch": 1.3886019590382903, + "grad_norm": 0.49373069405555725, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7614671289920807, + "num_tokens": 1121414231.0, + "step": 3899 + }, + { + "epoch": 1.3889581478183437, + "grad_norm": 0.4449290633201599, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7748429328203201, + "num_tokens": 1121703294.0, + "step": 3900 + }, + { + "epoch": 
1.389314336598397, + "grad_norm": 0.519832193851471, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7424060702323914, + "num_tokens": 1121965756.0, + "step": 3901 + }, + { + "epoch": 1.3896705253784507, + "grad_norm": 0.5655461549758911, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7648443579673767, + "num_tokens": 1122212309.0, + "step": 3902 + }, + { + "epoch": 1.390026714158504, + "grad_norm": 0.4326593279838562, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7696726769208908, + "num_tokens": 1122514527.0, + "step": 3903 + }, + { + "epoch": 1.3903829029385575, + "grad_norm": 0.4861949384212494, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7717146426439285, + "num_tokens": 1122782984.0, + "step": 3904 + }, + { + "epoch": 1.3907390917186109, + "grad_norm": 0.4974902272224426, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7558081746101379, + "num_tokens": 1123049401.0, + "step": 3905 + }, + { + "epoch": 1.3910952804986643, + "grad_norm": 0.48644542694091797, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7600300759077072, + "num_tokens": 1123335153.0, + "step": 3906 + }, + { + "epoch": 1.3914514692787177, + "grad_norm": 0.5294650793075562, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7598960399627686, + "num_tokens": 1123621097.0, + "step": 3907 + }, + { + "epoch": 1.391807658058771, + "grad_norm": 0.4298155605792999, + "learning_rate": 1e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7660166472196579, + "num_tokens": 1123940423.0, + "step": 3908 + }, + { + "epoch": 1.3921638468388247, + "grad_norm": 0.5100908279418945, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7622732520103455, + "num_tokens": 1124208576.0, + "step": 3909 + }, + { + "epoch": 1.392520035618878, + "grad_norm": 0.47515028715133667, + "learning_rate": 1e-06, + "loss": 0.6902, + 
"mean_token_accuracy": 0.778775691986084, + "num_tokens": 1124491902.0, + "step": 3910 + }, + { + "epoch": 1.3928762243989314, + "grad_norm": 0.4543716609477997, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.767855241894722, + "num_tokens": 1124783597.0, + "step": 3911 + }, + { + "epoch": 1.3932324131789848, + "grad_norm": 0.5178055763244629, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7625745087862015, + "num_tokens": 1125031235.0, + "step": 3912 + }, + { + "epoch": 1.3935886019590382, + "grad_norm": 0.43997013568878174, + "learning_rate": 1e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.7709880918264389, + "num_tokens": 1125348746.0, + "step": 3913 + }, + { + "epoch": 1.3939447907390918, + "grad_norm": 0.44989097118377686, + "learning_rate": 1e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7616328150033951, + "num_tokens": 1125663442.0, + "step": 3914 + }, + { + "epoch": 1.3943009795191452, + "grad_norm": 0.441777765750885, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7713215500116348, + "num_tokens": 1125966739.0, + "step": 3915 + }, + { + "epoch": 1.3946571682991986, + "grad_norm": 0.4747224748134613, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7657105922698975, + "num_tokens": 1126265074.0, + "step": 3916 + }, + { + "epoch": 1.395013357079252, + "grad_norm": 0.42717912793159485, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7555294036865234, + "num_tokens": 1126538825.0, + "step": 3917 + }, + { + "epoch": 1.3953695458593054, + "grad_norm": 0.48150989413261414, + "learning_rate": 1e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7478718757629395, + "num_tokens": 1126797685.0, + "step": 3918 + }, + { + "epoch": 1.3957257346393588, + "grad_norm": 0.46268293261528015, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.757643073797226, + "num_tokens": 1127086776.0, + "step": 3919 + }, + { + "epoch": 
1.3960819234194122, + "grad_norm": 0.4407429099082947, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7712440937757492, + "num_tokens": 1127404295.0, + "step": 3920 + }, + { + "epoch": 1.3964381121994658, + "grad_norm": 0.5062964558601379, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7608007490634918, + "num_tokens": 1127668750.0, + "step": 3921 + }, + { + "epoch": 1.3967943009795192, + "grad_norm": 0.4872991740703583, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.752576544880867, + "num_tokens": 1127956818.0, + "step": 3922 + }, + { + "epoch": 1.3971504897595726, + "grad_norm": 0.4891975224018097, + "learning_rate": 1e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7650969326496124, + "num_tokens": 1128221304.0, + "step": 3923 + }, + { + "epoch": 1.397506678539626, + "grad_norm": 0.43456003069877625, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7714370638132095, + "num_tokens": 1128521294.0, + "step": 3924 + }, + { + "epoch": 1.3978628673196793, + "grad_norm": 0.4816884994506836, + "learning_rate": 1e-06, + "loss": 0.768, + "mean_token_accuracy": 0.755986750125885, + "num_tokens": 1128811308.0, + "step": 3925 + }, + { + "epoch": 1.398219056099733, + "grad_norm": 0.4681158661842346, + "learning_rate": 1e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7544840276241302, + "num_tokens": 1129120570.0, + "step": 3926 + }, + { + "epoch": 1.3985752448797864, + "grad_norm": 0.49334219098091125, + "learning_rate": 1e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7692114114761353, + "num_tokens": 1129400652.0, + "step": 3927 + }, + { + "epoch": 1.3989314336598397, + "grad_norm": 0.4735408425331116, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7572815418243408, + "num_tokens": 1129693789.0, + "step": 3928 + }, + { + "epoch": 1.3992876224398931, + "grad_norm": 0.5269855260848999, + "learning_rate": 1e-06, + "loss": 0.711, + 
"mean_token_accuracy": 0.7736298590898514, + "num_tokens": 1129954751.0, + "step": 3929 + }, + { + "epoch": 1.3996438112199465, + "grad_norm": 0.4651608467102051, + "learning_rate": 1e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7642545849084854, + "num_tokens": 1130247613.0, + "step": 3930 + }, + { + "epoch": 1.4, + "grad_norm": 0.46380937099456787, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7540518492460251, + "num_tokens": 1130549382.0, + "step": 3931 + }, + { + "epoch": 1.4003561887800533, + "grad_norm": 0.43409910798072815, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7478373795747757, + "num_tokens": 1130828716.0, + "step": 3932 + }, + { + "epoch": 1.400712377560107, + "grad_norm": 0.5552263855934143, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7553455978631973, + "num_tokens": 1131068493.0, + "step": 3933 + }, + { + "epoch": 1.4010685663401603, + "grad_norm": 0.4967266321182251, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7428607940673828, + "num_tokens": 1131356477.0, + "step": 3934 + }, + { + "epoch": 1.4014247551202137, + "grad_norm": 0.4475145936012268, + "learning_rate": 1e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7757056504487991, + "num_tokens": 1131696184.0, + "step": 3935 + }, + { + "epoch": 1.401780943900267, + "grad_norm": 0.4462585747241974, + "learning_rate": 1e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7629405707120895, + "num_tokens": 1131983561.0, + "step": 3936 + }, + { + "epoch": 1.4021371326803207, + "grad_norm": 0.5081179141998291, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7623084783554077, + "num_tokens": 1132254510.0, + "step": 3937 + }, + { + "epoch": 1.402493321460374, + "grad_norm": 0.4772460162639618, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7619583457708359, + "num_tokens": 1132564072.0, + "step": 3938 + }, + { + "epoch": 1.4028495102404275, + 
"grad_norm": 0.46472471952438354, + "learning_rate": 1e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.763838529586792, + "num_tokens": 1132895986.0, + "step": 3939 + }, + { + "epoch": 1.4032056990204809, + "grad_norm": 0.46154603362083435, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7601586729288101, + "num_tokens": 1133180308.0, + "step": 3940 + }, + { + "epoch": 1.4035618878005343, + "grad_norm": 0.4829665422439575, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7709434032440186, + "num_tokens": 1133458458.0, + "step": 3941 + }, + { + "epoch": 1.4039180765805876, + "grad_norm": 0.4679628610610962, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7591865211725235, + "num_tokens": 1133718671.0, + "step": 3942 + }, + { + "epoch": 1.404274265360641, + "grad_norm": 0.47871828079223633, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7621115893125534, + "num_tokens": 1133991943.0, + "step": 3943 + }, + { + "epoch": 1.4046304541406947, + "grad_norm": 0.4942134916782379, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7717945575714111, + "num_tokens": 1134271212.0, + "step": 3944 + }, + { + "epoch": 1.404986642920748, + "grad_norm": 0.445044606924057, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7644653618335724, + "num_tokens": 1134537576.0, + "step": 3945 + }, + { + "epoch": 1.4053428317008014, + "grad_norm": 0.43760138750076294, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7577853351831436, + "num_tokens": 1134827572.0, + "step": 3946 + }, + { + "epoch": 1.4056990204808548, + "grad_norm": 0.46368011832237244, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.758720189332962, + "num_tokens": 1135129529.0, + "step": 3947 + }, + { + "epoch": 1.4060552092609082, + "grad_norm": 0.47741225361824036, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 
0.7569336742162704, + "num_tokens": 1135423020.0, + "step": 3948 + }, + { + "epoch": 1.4064113980409618, + "grad_norm": 0.49750396609306335, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7548538446426392, + "num_tokens": 1135689725.0, + "step": 3949 + }, + { + "epoch": 1.4067675868210152, + "grad_norm": 0.492095410823822, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7571258991956711, + "num_tokens": 1135973245.0, + "step": 3950 + }, + { + "epoch": 1.4071237756010686, + "grad_norm": 0.47167903184890747, + "learning_rate": 1e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7691723108291626, + "num_tokens": 1136277233.0, + "step": 3951 + }, + { + "epoch": 1.407479964381122, + "grad_norm": 0.4542835056781769, + "learning_rate": 1e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7803571373224258, + "num_tokens": 1136570065.0, + "step": 3952 + }, + { + "epoch": 1.4078361531611754, + "grad_norm": 0.47197258472442627, + "learning_rate": 1e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7783266603946686, + "num_tokens": 1136819263.0, + "step": 3953 + }, + { + "epoch": 1.4081923419412288, + "grad_norm": 0.47735437750816345, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7576543092727661, + "num_tokens": 1137089540.0, + "step": 3954 + }, + { + "epoch": 1.4085485307212822, + "grad_norm": 0.4957239627838135, + "learning_rate": 1e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7677265107631683, + "num_tokens": 1137380459.0, + "step": 3955 + }, + { + "epoch": 1.4089047195013358, + "grad_norm": 0.5339805483818054, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.754766657948494, + "num_tokens": 1137662103.0, + "step": 3956 + }, + { + "epoch": 1.4092609082813892, + "grad_norm": 0.4926389157772064, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7664674669504166, + "num_tokens": 1137934953.0, + "step": 3957 + }, + { + "epoch": 1.4096170970614426, + 
"grad_norm": 0.4218841791152954, + "learning_rate": 1e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7773166596889496, + "num_tokens": 1138213180.0, + "step": 3958 + }, + { + "epoch": 1.409973285841496, + "grad_norm": 0.521086573600769, + "learning_rate": 1e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7739784568548203, + "num_tokens": 1138477273.0, + "step": 3959 + }, + { + "epoch": 1.4103294746215493, + "grad_norm": 0.43363156914711, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7430787980556488, + "num_tokens": 1138761530.0, + "step": 3960 + }, + { + "epoch": 1.410685663401603, + "grad_norm": 0.5077871084213257, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7589228451251984, + "num_tokens": 1139019605.0, + "step": 3961 + }, + { + "epoch": 1.4110418521816563, + "grad_norm": 0.498086541891098, + "learning_rate": 1e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7499769181013107, + "num_tokens": 1139297334.0, + "step": 3962 + }, + { + "epoch": 1.4113980409617097, + "grad_norm": 0.4939715266227722, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.765462651848793, + "num_tokens": 1139586893.0, + "step": 3963 + }, + { + "epoch": 1.4117542297417631, + "grad_norm": 0.49570322036743164, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7725346982479095, + "num_tokens": 1139853025.0, + "step": 3964 + }, + { + "epoch": 1.4121104185218165, + "grad_norm": 0.4868699610233307, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.76202192902565, + "num_tokens": 1140135385.0, + "step": 3965 + }, + { + "epoch": 1.41246660730187, + "grad_norm": 0.444416880607605, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.760023906826973, + "num_tokens": 1140454740.0, + "step": 3966 + }, + { + "epoch": 1.4128227960819233, + "grad_norm": 0.5216339230537415, + "learning_rate": 1e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7752584517002106, + 
"num_tokens": 1140737575.0, + "step": 3967 + }, + { + "epoch": 1.413178984861977, + "grad_norm": 0.46814092993736267, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7675231397151947, + "num_tokens": 1141026372.0, + "step": 3968 + }, + { + "epoch": 1.4135351736420303, + "grad_norm": 0.48615747690200806, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7764976173639297, + "num_tokens": 1141267606.0, + "step": 3969 + }, + { + "epoch": 1.4138913624220837, + "grad_norm": 0.44442689418792725, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7576925754547119, + "num_tokens": 1141596993.0, + "step": 3970 + }, + { + "epoch": 1.414247551202137, + "grad_norm": 0.5279011726379395, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7640853524208069, + "num_tokens": 1141860226.0, + "step": 3971 + }, + { + "epoch": 1.4146037399821907, + "grad_norm": 0.46032190322875977, + "learning_rate": 1e-06, + "loss": 0.645, + "mean_token_accuracy": 0.7905533760786057, + "num_tokens": 1142145922.0, + "step": 3972 + }, + { + "epoch": 1.414959928762244, + "grad_norm": 0.4543444812297821, + "learning_rate": 1e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7778663784265518, + "num_tokens": 1142433909.0, + "step": 3973 + }, + { + "epoch": 1.4153161175422975, + "grad_norm": 0.5090624690055847, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7546795904636383, + "num_tokens": 1142740651.0, + "step": 3974 + }, + { + "epoch": 1.4156723063223509, + "grad_norm": 0.4601905941963196, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7581978738307953, + "num_tokens": 1143028138.0, + "step": 3975 + }, + { + "epoch": 1.4160284951024042, + "grad_norm": 0.4707123041152954, + "learning_rate": 1e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.7785085290670395, + "num_tokens": 1143307390.0, + "step": 3976 + }, + { + "epoch": 1.4163846838824576, + "grad_norm": 0.45347392559051514, 
+ "learning_rate": 1e-06, + "loss": 0.678, + "mean_token_accuracy": 0.7821685373783112, + "num_tokens": 1143601291.0, + "step": 3977 + }, + { + "epoch": 1.416740872662511, + "grad_norm": 0.44670185446739197, + "learning_rate": 1e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.7886654138565063, + "num_tokens": 1143898931.0, + "step": 3978 + }, + { + "epoch": 1.4170970614425644, + "grad_norm": 0.4998572766780853, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7714771777391434, + "num_tokens": 1144173346.0, + "step": 3979 + }, + { + "epoch": 1.417453250222618, + "grad_norm": 0.47771695256233215, + "learning_rate": 1e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7717580944299698, + "num_tokens": 1144456728.0, + "step": 3980 + }, + { + "epoch": 1.4178094390026714, + "grad_norm": 0.4812077581882477, + "learning_rate": 1e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7569497525691986, + "num_tokens": 1144729375.0, + "step": 3981 + }, + { + "epoch": 1.4181656277827248, + "grad_norm": 0.4479964077472687, + "learning_rate": 1e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.780487060546875, + "num_tokens": 1145007839.0, + "step": 3982 + }, + { + "epoch": 1.4185218165627782, + "grad_norm": 0.45786601305007935, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.741658940911293, + "num_tokens": 1145305584.0, + "step": 3983 + }, + { + "epoch": 1.4188780053428318, + "grad_norm": 0.4467996656894684, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7623285353183746, + "num_tokens": 1145615461.0, + "step": 3984 + }, + { + "epoch": 1.4192341941228852, + "grad_norm": 0.4866165220737457, + "learning_rate": 1e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7675540745258331, + "num_tokens": 1145903704.0, + "step": 3985 + }, + { + "epoch": 1.4195903829029386, + "grad_norm": 0.626785159111023, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7494523525238037, + "num_tokens": 1146160497.0, + 
"step": 3986 + }, + { + "epoch": 1.419946571682992, + "grad_norm": 0.4939584732055664, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7532465308904648, + "num_tokens": 1146443951.0, + "step": 3987 + }, + { + "epoch": 1.4203027604630454, + "grad_norm": 0.4804207384586334, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7790029048919678, + "num_tokens": 1146717627.0, + "step": 3988 + }, + { + "epoch": 1.4206589492430988, + "grad_norm": 0.5140586495399475, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7526099681854248, + "num_tokens": 1146981678.0, + "step": 3989 + }, + { + "epoch": 1.4210151380231522, + "grad_norm": 0.43308061361312866, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7690277099609375, + "num_tokens": 1147307626.0, + "step": 3990 + }, + { + "epoch": 1.4213713268032058, + "grad_norm": 0.4745728075504303, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7608237415552139, + "num_tokens": 1147630756.0, + "step": 3991 + }, + { + "epoch": 1.4217275155832592, + "grad_norm": 0.5275938510894775, + "learning_rate": 1e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7518674284219742, + "num_tokens": 1147938791.0, + "step": 3992 + }, + { + "epoch": 1.4220837043633126, + "grad_norm": 0.47392693161964417, + "learning_rate": 1e-06, + "loss": 0.773, + "mean_token_accuracy": 0.763064980506897, + "num_tokens": 1148225482.0, + "step": 3993 + }, + { + "epoch": 1.422439893143366, + "grad_norm": 0.5101982951164246, + "learning_rate": 1e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7446881830692291, + "num_tokens": 1148491252.0, + "step": 3994 + }, + { + "epoch": 1.4227960819234193, + "grad_norm": 0.4582325518131256, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7700461745262146, + "num_tokens": 1148787865.0, + "step": 3995 + }, + { + "epoch": 1.423152270703473, + "grad_norm": 0.4292067885398865, + "learning_rate": 1e-06, + 
"loss": 0.7277, + "mean_token_accuracy": 0.7693280428647995, + "num_tokens": 1149096120.0, + "step": 3996 + }, + { + "epoch": 1.4235084594835263, + "grad_norm": 0.4419688284397125, + "learning_rate": 1e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7752479016780853, + "num_tokens": 1149394290.0, + "step": 3997 + }, + { + "epoch": 1.4238646482635797, + "grad_norm": 0.5344378352165222, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7570342868566513, + "num_tokens": 1149677896.0, + "step": 3998 + }, + { + "epoch": 1.4242208370436331, + "grad_norm": 0.5185781717300415, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7689642310142517, + "num_tokens": 1149967057.0, + "step": 3999 + }, + { + "epoch": 1.4245770258236865, + "grad_norm": 0.5038827657699585, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7599547356367111, + "num_tokens": 1150242036.0, + "step": 4000 + }, + { + "epoch": 1.42493321460374, + "grad_norm": 0.49345168471336365, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7641026079654694, + "num_tokens": 1150524346.0, + "step": 4001 + }, + { + "epoch": 1.4252894033837933, + "grad_norm": 0.42538049817085266, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.75771763920784, + "num_tokens": 1150847067.0, + "step": 4002 + }, + { + "epoch": 1.425645592163847, + "grad_norm": 0.481698602437973, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7808360904455185, + "num_tokens": 1151138581.0, + "step": 4003 + }, + { + "epoch": 1.4260017809439003, + "grad_norm": 0.4305541515350342, + "learning_rate": 1e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7660840451717377, + "num_tokens": 1151440428.0, + "step": 4004 + }, + { + "epoch": 1.4263579697239537, + "grad_norm": 0.434798926115036, + "learning_rate": 1e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7547928690910339, + "num_tokens": 1151787341.0, + "step": 4005 + }, + { + 
"epoch": 1.426714158504007, + "grad_norm": 0.4670140743255615, + "learning_rate": 1e-06, + "loss": 0.6767, + "mean_token_accuracy": 0.783871591091156, + "num_tokens": 1152061628.0, + "step": 4006 + }, + { + "epoch": 1.4270703472840607, + "grad_norm": 0.4352364242076874, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7688679248094559, + "num_tokens": 1152354205.0, + "step": 4007 + }, + { + "epoch": 1.427426536064114, + "grad_norm": 0.5214498043060303, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7641398012638092, + "num_tokens": 1152605622.0, + "step": 4008 + }, + { + "epoch": 1.4277827248441675, + "grad_norm": 0.4120498299598694, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.769210696220398, + "num_tokens": 1152927150.0, + "step": 4009 + }, + { + "epoch": 1.4281389136242209, + "grad_norm": 0.4768071174621582, + "learning_rate": 1e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7725305110216141, + "num_tokens": 1153225723.0, + "step": 4010 + }, + { + "epoch": 1.4284951024042742, + "grad_norm": 0.44473007321357727, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7689938545227051, + "num_tokens": 1153522889.0, + "step": 4011 + }, + { + "epoch": 1.4288512911843276, + "grad_norm": 0.45670533180236816, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7727043330669403, + "num_tokens": 1153810802.0, + "step": 4012 + }, + { + "epoch": 1.429207479964381, + "grad_norm": 0.48265746235847473, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7598002701997757, + "num_tokens": 1154088269.0, + "step": 4013 + }, + { + "epoch": 1.4295636687444344, + "grad_norm": 0.5061374306678772, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7563665509223938, + "num_tokens": 1154366014.0, + "step": 4014 + }, + { + "epoch": 1.429919857524488, + "grad_norm": 0.5082041025161743, + "learning_rate": 1e-06, + "loss": 0.8169, + 
"mean_token_accuracy": 0.7470484226942062, + "num_tokens": 1154643823.0, + "step": 4015 + }, + { + "epoch": 1.4302760463045414, + "grad_norm": 0.4713299572467804, + "learning_rate": 1e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.767925500869751, + "num_tokens": 1154940166.0, + "step": 4016 + }, + { + "epoch": 1.4306322350845948, + "grad_norm": 0.4416373670101166, + "learning_rate": 1e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7501967549324036, + "num_tokens": 1155239372.0, + "step": 4017 + }, + { + "epoch": 1.4309884238646482, + "grad_norm": 0.4438949525356293, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7683965116739273, + "num_tokens": 1155549560.0, + "step": 4018 + }, + { + "epoch": 1.4313446126447018, + "grad_norm": 0.5037242770195007, + "learning_rate": 1e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7687532156705856, + "num_tokens": 1155826763.0, + "step": 4019 + }, + { + "epoch": 1.4317008014247552, + "grad_norm": 0.48069870471954346, + "learning_rate": 1e-06, + "loss": 0.6469, + "mean_token_accuracy": 0.7850608825683594, + "num_tokens": 1156112910.0, + "step": 4020 + }, + { + "epoch": 1.4320569902048086, + "grad_norm": 0.48414573073387146, + "learning_rate": 1e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.764165073633194, + "num_tokens": 1156380376.0, + "step": 4021 + }, + { + "epoch": 1.432413178984862, + "grad_norm": 0.5096988677978516, + "learning_rate": 1e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7496588379144669, + "num_tokens": 1156644616.0, + "step": 4022 + }, + { + "epoch": 1.4327693677649154, + "grad_norm": 0.4716635048389435, + "learning_rate": 1e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7593184411525726, + "num_tokens": 1156917341.0, + "step": 4023 + }, + { + "epoch": 1.4331255565449688, + "grad_norm": 0.5232839584350586, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7605411559343338, + "num_tokens": 1157181437.0, + "step": 4024 + }, + { + "epoch": 
1.4334817453250221, + "grad_norm": 0.5023995637893677, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7677745074033737, + "num_tokens": 1157469254.0, + "step": 4025 + }, + { + "epoch": 1.4338379341050758, + "grad_norm": 0.5508496165275574, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7558749318122864, + "num_tokens": 1157716367.0, + "step": 4026 + }, + { + "epoch": 1.4341941228851292, + "grad_norm": 0.46042388677597046, + "learning_rate": 1e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7622412443161011, + "num_tokens": 1157985071.0, + "step": 4027 + }, + { + "epoch": 1.4345503116651825, + "grad_norm": 0.46362122893333435, + "learning_rate": 1e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7655509263277054, + "num_tokens": 1158277714.0, + "step": 4028 + }, + { + "epoch": 1.434906500445236, + "grad_norm": 0.46097439527511597, + "learning_rate": 1e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7643224895000458, + "num_tokens": 1158535763.0, + "step": 4029 + }, + { + "epoch": 1.4352626892252893, + "grad_norm": 0.4917868971824646, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7681357264518738, + "num_tokens": 1158851878.0, + "step": 4030 + }, + { + "epoch": 1.435618878005343, + "grad_norm": 0.4714905917644501, + "learning_rate": 1e-06, + "loss": 0.6633, + "mean_token_accuracy": 0.7901238054037094, + "num_tokens": 1159125392.0, + "step": 4031 + }, + { + "epoch": 1.4359750667853963, + "grad_norm": 0.47578513622283936, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7665429413318634, + "num_tokens": 1159401658.0, + "step": 4032 + }, + { + "epoch": 1.4363312555654497, + "grad_norm": 0.4728788137435913, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7505989968776703, + "num_tokens": 1159703554.0, + "step": 4033 + }, + { + "epoch": 1.436687444345503, + "grad_norm": 0.4741925597190857, + "learning_rate": 1e-06, + "loss": 0.8039, + 
"mean_token_accuracy": 0.7485165297985077, + "num_tokens": 1159984876.0, + "step": 4034 + }, + { + "epoch": 1.4370436331255565, + "grad_norm": 0.4654526710510254, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7772190719842911, + "num_tokens": 1160285308.0, + "step": 4035 + }, + { + "epoch": 1.4373998219056099, + "grad_norm": 0.4779580533504486, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7429929971694946, + "num_tokens": 1160555506.0, + "step": 4036 + }, + { + "epoch": 1.4377560106856633, + "grad_norm": 0.5000224709510803, + "learning_rate": 1e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.769132599234581, + "num_tokens": 1160818933.0, + "step": 4037 + }, + { + "epoch": 1.438112199465717, + "grad_norm": 0.45997193455696106, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7636625021696091, + "num_tokens": 1161108680.0, + "step": 4038 + }, + { + "epoch": 1.4384683882457703, + "grad_norm": 0.4777933359146118, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7713631391525269, + "num_tokens": 1161370864.0, + "step": 4039 + }, + { + "epoch": 1.4388245770258237, + "grad_norm": 0.4704669713973999, + "learning_rate": 1e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.780191570520401, + "num_tokens": 1161636353.0, + "step": 4040 + }, + { + "epoch": 1.439180765805877, + "grad_norm": 0.5309433937072754, + "learning_rate": 1e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7654053121805191, + "num_tokens": 1161890369.0, + "step": 4041 + }, + { + "epoch": 1.4395369545859307, + "grad_norm": 0.42638278007507324, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7685212045907974, + "num_tokens": 1162207179.0, + "step": 4042 + }, + { + "epoch": 1.439893143365984, + "grad_norm": 0.49432113766670227, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7569348961114883, + "num_tokens": 1162469556.0, + "step": 4043 + }, + { + "epoch": 
1.4402493321460375, + "grad_norm": 0.5290936827659607, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7332432121038437, + "num_tokens": 1162732964.0, + "step": 4044 + }, + { + "epoch": 1.4406055209260908, + "grad_norm": 0.4707391858100891, + "learning_rate": 1e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.777977779507637, + "num_tokens": 1163008422.0, + "step": 4045 + }, + { + "epoch": 1.4409617097061442, + "grad_norm": 0.539267897605896, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.766224279999733, + "num_tokens": 1163264478.0, + "step": 4046 + }, + { + "epoch": 1.4413178984861976, + "grad_norm": 0.5236873030662537, + "learning_rate": 1e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.7561894804239273, + "num_tokens": 1163536487.0, + "step": 4047 + }, + { + "epoch": 1.441674087266251, + "grad_norm": 0.48416993021965027, + "learning_rate": 1e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7791179269552231, + "num_tokens": 1163818944.0, + "step": 4048 + }, + { + "epoch": 1.4420302760463044, + "grad_norm": 0.4769132733345032, + "learning_rate": 1e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.7579937726259232, + "num_tokens": 1164087153.0, + "step": 4049 + }, + { + "epoch": 1.442386464826358, + "grad_norm": 0.47986945509910583, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7618990689516068, + "num_tokens": 1164386267.0, + "step": 4050 + }, + { + "epoch": 1.4427426536064114, + "grad_norm": 0.4819086492061615, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7704657912254333, + "num_tokens": 1164683507.0, + "step": 4051 + }, + { + "epoch": 1.4430988423864648, + "grad_norm": 0.4263809025287628, + "learning_rate": 1e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7484875619411469, + "num_tokens": 1165010538.0, + "step": 4052 + }, + { + "epoch": 1.4434550311665182, + "grad_norm": 0.4806290864944458, + "learning_rate": 1e-06, + "loss": 0.7041, + 
"mean_token_accuracy": 0.7712689936161041, + "num_tokens": 1165297457.0, + "step": 4053 + }, + { + "epoch": 1.4438112199465718, + "grad_norm": 0.4768887460231781, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7623035609722137, + "num_tokens": 1165580536.0, + "step": 4054 + }, + { + "epoch": 1.4441674087266252, + "grad_norm": 0.4363545775413513, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7658236026763916, + "num_tokens": 1165883101.0, + "step": 4055 + }, + { + "epoch": 1.4445235975066786, + "grad_norm": 0.47100889682769775, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.753154382109642, + "num_tokens": 1166190094.0, + "step": 4056 + }, + { + "epoch": 1.444879786286732, + "grad_norm": 0.4123469889163971, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7695693075656891, + "num_tokens": 1166492173.0, + "step": 4057 + }, + { + "epoch": 1.4452359750667854, + "grad_norm": 0.4429605007171631, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7509191781282425, + "num_tokens": 1166751846.0, + "step": 4058 + }, + { + "epoch": 1.4455921638468388, + "grad_norm": 0.5016980171203613, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7656794488430023, + "num_tokens": 1167021641.0, + "step": 4059 + }, + { + "epoch": 1.4459483526268921, + "grad_norm": 0.46789830923080444, + "learning_rate": 1e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7641547471284866, + "num_tokens": 1167322030.0, + "step": 4060 + }, + { + "epoch": 1.4463045414069458, + "grad_norm": 0.46385541558265686, + "learning_rate": 1e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.7831814587116241, + "num_tokens": 1167611004.0, + "step": 4061 + }, + { + "epoch": 1.4466607301869991, + "grad_norm": 0.4678962528705597, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7603039592504501, + "num_tokens": 1167934188.0, + "step": 4062 + }, + { + "epoch": 
1.4470169189670525, + "grad_norm": 0.5137543082237244, + "learning_rate": 1e-06, + "loss": 0.6766, + "mean_token_accuracy": 0.780903160572052, + "num_tokens": 1168207885.0, + "step": 4063 + }, + { + "epoch": 1.447373107747106, + "grad_norm": 0.4835459291934967, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7630725502967834, + "num_tokens": 1168498542.0, + "step": 4064 + }, + { + "epoch": 1.4477292965271593, + "grad_norm": 0.46278145909309387, + "learning_rate": 1e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7715128064155579, + "num_tokens": 1168765571.0, + "step": 4065 + }, + { + "epoch": 1.448085485307213, + "grad_norm": 0.4623907804489136, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.769237756729126, + "num_tokens": 1169081323.0, + "step": 4066 + }, + { + "epoch": 1.4484416740872663, + "grad_norm": 0.5038986802101135, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7767871469259262, + "num_tokens": 1169371529.0, + "step": 4067 + }, + { + "epoch": 1.4487978628673197, + "grad_norm": 0.47566714882850647, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7714390605688095, + "num_tokens": 1169688845.0, + "step": 4068 + }, + { + "epoch": 1.449154051647373, + "grad_norm": 0.46275851130485535, + "learning_rate": 1e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7660961300134659, + "num_tokens": 1169968078.0, + "step": 4069 + }, + { + "epoch": 1.4495102404274265, + "grad_norm": 0.4030289649963379, + "learning_rate": 1e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7815694212913513, + "num_tokens": 1170281228.0, + "step": 4070 + }, + { + "epoch": 1.4498664292074799, + "grad_norm": 0.5000610947608948, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7523266077041626, + "num_tokens": 1170572292.0, + "step": 4071 + }, + { + "epoch": 1.4502226179875333, + "grad_norm": 0.48695725202560425, + "learning_rate": 1e-06, + "loss": 0.689, + 
"mean_token_accuracy": 0.7795142978429794, + "num_tokens": 1170860888.0, + "step": 4072 + }, + { + "epoch": 1.4505788067675869, + "grad_norm": 0.4830891788005829, + "learning_rate": 1e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7663684487342834, + "num_tokens": 1171150701.0, + "step": 4073 + }, + { + "epoch": 1.4509349955476403, + "grad_norm": 0.48554396629333496, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7693840861320496, + "num_tokens": 1171441819.0, + "step": 4074 + }, + { + "epoch": 1.4512911843276937, + "grad_norm": 0.46453410387039185, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7730983048677444, + "num_tokens": 1171717927.0, + "step": 4075 + }, + { + "epoch": 1.451647373107747, + "grad_norm": 0.46648624539375305, + "learning_rate": 1e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7637346982955933, + "num_tokens": 1172005766.0, + "step": 4076 + }, + { + "epoch": 1.4520035618878007, + "grad_norm": 0.474893718957901, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7658510506153107, + "num_tokens": 1172290164.0, + "step": 4077 + }, + { + "epoch": 1.452359750667854, + "grad_norm": 0.4843420684337616, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7654171884059906, + "num_tokens": 1172566618.0, + "step": 4078 + }, + { + "epoch": 1.4527159394479074, + "grad_norm": 0.48745426535606384, + "learning_rate": 1e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7760065644979477, + "num_tokens": 1172832754.0, + "step": 4079 + }, + { + "epoch": 1.4530721282279608, + "grad_norm": 0.473130464553833, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7299058437347412, + "num_tokens": 1173130338.0, + "step": 4080 + }, + { + "epoch": 1.4534283170080142, + "grad_norm": 0.49727049469947815, + "learning_rate": 1e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7643715590238571, + "num_tokens": 1173418307.0, + "step": 4081 + }, + { + "epoch": 
1.4537845057880676, + "grad_norm": 0.49815353751182556, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7774509191513062, + "num_tokens": 1173706032.0, + "step": 4082 + }, + { + "epoch": 1.454140694568121, + "grad_norm": 0.4351034164428711, + "learning_rate": 1e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7798857688903809, + "num_tokens": 1173996202.0, + "step": 4083 + }, + { + "epoch": 1.4544968833481744, + "grad_norm": 0.4441418945789337, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7591328471899033, + "num_tokens": 1174304595.0, + "step": 4084 + }, + { + "epoch": 1.454853072128228, + "grad_norm": 0.46712878346443176, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7653882950544357, + "num_tokens": 1174577063.0, + "step": 4085 + }, + { + "epoch": 1.4552092609082814, + "grad_norm": 0.4541124105453491, + "learning_rate": 1e-06, + "loss": 0.6834, + "mean_token_accuracy": 0.7786078304052353, + "num_tokens": 1174903094.0, + "step": 4086 + }, + { + "epoch": 1.4555654496883348, + "grad_norm": 0.4966977536678314, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.768716886639595, + "num_tokens": 1175174040.0, + "step": 4087 + }, + { + "epoch": 1.4559216384683882, + "grad_norm": 0.42012694478034973, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7728396654129028, + "num_tokens": 1175464746.0, + "step": 4088 + }, + { + "epoch": 1.4562778272484418, + "grad_norm": 0.4787824749946594, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7732479423284531, + "num_tokens": 1175735147.0, + "step": 4089 + }, + { + "epoch": 1.4566340160284952, + "grad_norm": 0.5071926116943359, + "learning_rate": 1e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.7807829082012177, + "num_tokens": 1176006338.0, + "step": 4090 + }, + { + "epoch": 1.4569902048085486, + "grad_norm": 0.507595956325531, + "learning_rate": 1e-06, + "loss": 0.811, + 
"mean_token_accuracy": 0.7520291954278946, + "num_tokens": 1176298690.0, + "step": 4091 + }, + { + "epoch": 1.457346393588602, + "grad_norm": 0.4627412259578705, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7658752202987671, + "num_tokens": 1176583826.0, + "step": 4092 + }, + { + "epoch": 1.4577025823686554, + "grad_norm": 0.48866185545921326, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7708124071359634, + "num_tokens": 1176834875.0, + "step": 4093 + }, + { + "epoch": 1.4580587711487087, + "grad_norm": 0.4798599183559418, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7769210487604141, + "num_tokens": 1177136803.0, + "step": 4094 + }, + { + "epoch": 1.4584149599287621, + "grad_norm": 0.47469818592071533, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7717839777469635, + "num_tokens": 1177397348.0, + "step": 4095 + }, + { + "epoch": 1.4587711487088157, + "grad_norm": 0.4447728395462036, + "learning_rate": 1e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7739448845386505, + "num_tokens": 1177680052.0, + "step": 4096 + }, + { + "epoch": 1.4591273374888691, + "grad_norm": 0.48709189891815186, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7514007091522217, + "num_tokens": 1177967559.0, + "step": 4097 + }, + { + "epoch": 1.4594835262689225, + "grad_norm": 0.4138496220111847, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7602780014276505, + "num_tokens": 1178267366.0, + "step": 4098 + }, + { + "epoch": 1.459839715048976, + "grad_norm": 0.44062313437461853, + "learning_rate": 1e-06, + "loss": 0.6564, + "mean_token_accuracy": 0.7836157828569412, + "num_tokens": 1178562762.0, + "step": 4099 + }, + { + "epoch": 1.4601959038290293, + "grad_norm": 0.4832735061645508, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7627318948507309, + "num_tokens": 1178835396.0, + "step": 4100 + }, + { + "epoch": 
1.460552092609083, + "grad_norm": 0.5197393894195557, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.760987251996994, + "num_tokens": 1179095222.0, + "step": 4101 + }, + { + "epoch": 1.4609082813891363, + "grad_norm": 0.522682249546051, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7683260291814804, + "num_tokens": 1179386156.0, + "step": 4102 + }, + { + "epoch": 1.4612644701691897, + "grad_norm": 0.442167192697525, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7622041255235672, + "num_tokens": 1179708942.0, + "step": 4103 + }, + { + "epoch": 1.461620658949243, + "grad_norm": 0.4779345393180847, + "learning_rate": 1e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7751368880271912, + "num_tokens": 1179989569.0, + "step": 4104 + }, + { + "epoch": 1.4619768477292965, + "grad_norm": 0.46734654903411865, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7546325623989105, + "num_tokens": 1180296065.0, + "step": 4105 + }, + { + "epoch": 1.4623330365093499, + "grad_norm": 0.5004633069038391, + "learning_rate": 1e-06, + "loss": 0.7087, + "mean_token_accuracy": 0.7648300677537918, + "num_tokens": 1180551859.0, + "step": 4106 + }, + { + "epoch": 1.4626892252894033, + "grad_norm": 0.5110118985176086, + "learning_rate": 1e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7463048845529556, + "num_tokens": 1180825598.0, + "step": 4107 + }, + { + "epoch": 1.4630454140694569, + "grad_norm": 0.5102297067642212, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7550309747457504, + "num_tokens": 1181098251.0, + "step": 4108 + }, + { + "epoch": 1.4634016028495103, + "grad_norm": 0.45482274889945984, + "learning_rate": 1e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7731514573097229, + "num_tokens": 1181385638.0, + "step": 4109 + }, + { + "epoch": 1.4637577916295637, + "grad_norm": 0.47416478395462036, + "learning_rate": 1e-06, + "loss": 0.7092, + 
"mean_token_accuracy": 0.7715471684932709, + "num_tokens": 1181674838.0, + "step": 4110 + }, + { + "epoch": 1.464113980409617, + "grad_norm": 0.5194635391235352, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7552285343408585, + "num_tokens": 1181930885.0, + "step": 4111 + }, + { + "epoch": 1.4644701691896707, + "grad_norm": 0.47554877400398254, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7532041966915131, + "num_tokens": 1182220706.0, + "step": 4112 + }, + { + "epoch": 1.464826357969724, + "grad_norm": 0.5048959851264954, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7743562757968903, + "num_tokens": 1182484463.0, + "step": 4113 + }, + { + "epoch": 1.4651825467497774, + "grad_norm": 0.4989798069000244, + "learning_rate": 1e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.772890254855156, + "num_tokens": 1182769572.0, + "step": 4114 + }, + { + "epoch": 1.4655387355298308, + "grad_norm": 0.43727079033851624, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7735381573438644, + "num_tokens": 1183095351.0, + "step": 4115 + }, + { + "epoch": 1.4658949243098842, + "grad_norm": 0.4388032853603363, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.763400673866272, + "num_tokens": 1183407868.0, + "step": 4116 + }, + { + "epoch": 1.4662511130899376, + "grad_norm": 0.4978811740875244, + "learning_rate": 1e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.743686705827713, + "num_tokens": 1183680777.0, + "step": 4117 + }, + { + "epoch": 1.466607301869991, + "grad_norm": 0.4821574091911316, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7579436749219894, + "num_tokens": 1183979331.0, + "step": 4118 + }, + { + "epoch": 1.4669634906500444, + "grad_norm": 0.4917231500148773, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7754008024930954, + "num_tokens": 1184242875.0, + "step": 4119 + }, + { + "epoch": 
1.467319679430098, + "grad_norm": 0.4899531900882721, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7642636597156525, + "num_tokens": 1184530562.0, + "step": 4120 + }, + { + "epoch": 1.4676758682101514, + "grad_norm": 0.4388747215270996, + "learning_rate": 1e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7695395946502686, + "num_tokens": 1184826055.0, + "step": 4121 + }, + { + "epoch": 1.4680320569902048, + "grad_norm": 0.4512746334075928, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7734736800193787, + "num_tokens": 1185121614.0, + "step": 4122 + }, + { + "epoch": 1.4683882457702582, + "grad_norm": 0.50075364112854, + "learning_rate": 1e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7728550732135773, + "num_tokens": 1185409156.0, + "step": 4123 + }, + { + "epoch": 1.4687444345503118, + "grad_norm": 0.5077800154685974, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7633328139781952, + "num_tokens": 1185680574.0, + "step": 4124 + }, + { + "epoch": 1.4691006233303652, + "grad_norm": 0.5185083150863647, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7616479992866516, + "num_tokens": 1185958762.0, + "step": 4125 + }, + { + "epoch": 1.4694568121104186, + "grad_norm": 0.46233516931533813, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7488412410020828, + "num_tokens": 1186275613.0, + "step": 4126 + }, + { + "epoch": 1.469813000890472, + "grad_norm": 0.477653831243515, + "learning_rate": 1e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.7866514623165131, + "num_tokens": 1186554006.0, + "step": 4127 + }, + { + "epoch": 1.4701691896705253, + "grad_norm": 0.49341049790382385, + "learning_rate": 1e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7734145820140839, + "num_tokens": 1186815866.0, + "step": 4128 + }, + { + "epoch": 1.4705253784505787, + "grad_norm": 0.5084570646286011, + "learning_rate": 1e-06, + "loss": 0.6839, + "mean_token_accuracy": 
0.7785778194665909, + "num_tokens": 1187106203.0, + "step": 4129 + }, + { + "epoch": 1.4708815672306321, + "grad_norm": 0.487434059381485, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7722450345754623, + "num_tokens": 1187384793.0, + "step": 4130 + }, + { + "epoch": 1.4712377560106857, + "grad_norm": 0.43819278478622437, + "learning_rate": 1e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7703802734613419, + "num_tokens": 1187695483.0, + "step": 4131 + }, + { + "epoch": 1.4715939447907391, + "grad_norm": 0.48945125937461853, + "learning_rate": 1e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7735040336847305, + "num_tokens": 1187994372.0, + "step": 4132 + }, + { + "epoch": 1.4719501335707925, + "grad_norm": 0.44090670347213745, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7650525569915771, + "num_tokens": 1188323170.0, + "step": 4133 + }, + { + "epoch": 1.472306322350846, + "grad_norm": 0.5110030174255371, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7489064335823059, + "num_tokens": 1188563957.0, + "step": 4134 + }, + { + "epoch": 1.4726625111308993, + "grad_norm": 0.4506116211414337, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7677131295204163, + "num_tokens": 1188857027.0, + "step": 4135 + }, + { + "epoch": 1.473018699910953, + "grad_norm": 0.44076791405677795, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7709310501813889, + "num_tokens": 1189161428.0, + "step": 4136 + }, + { + "epoch": 1.4733748886910063, + "grad_norm": 0.45249539613723755, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7697314620018005, + "num_tokens": 1189460966.0, + "step": 4137 + }, + { + "epoch": 1.4737310774710597, + "grad_norm": 0.48648157715797424, + "learning_rate": 1e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7505390495061874, + "num_tokens": 1189712290.0, + "step": 4138 + }, + { + "epoch": 1.474087266251113, + 
"grad_norm": 0.42620059847831726, + "learning_rate": 1e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7754191160202026, + "num_tokens": 1189998511.0, + "step": 4139 + }, + { + "epoch": 1.4744434550311665, + "grad_norm": 0.45510968565940857, + "learning_rate": 1e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7708309143781662, + "num_tokens": 1190294612.0, + "step": 4140 + }, + { + "epoch": 1.4747996438112199, + "grad_norm": 0.5062077641487122, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7733506709337234, + "num_tokens": 1190570530.0, + "step": 4141 + }, + { + "epoch": 1.4751558325912733, + "grad_norm": 0.4695132374763489, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7617789804935455, + "num_tokens": 1190838769.0, + "step": 4142 + }, + { + "epoch": 1.4755120213713269, + "grad_norm": 0.43751391768455505, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7671920359134674, + "num_tokens": 1191145722.0, + "step": 4143 + }, + { + "epoch": 1.4758682101513803, + "grad_norm": 0.4185616075992584, + "learning_rate": 1e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7656810134649277, + "num_tokens": 1191426488.0, + "step": 4144 + }, + { + "epoch": 1.4762243989314336, + "grad_norm": 0.46119511127471924, + "learning_rate": 1e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7676657736301422, + "num_tokens": 1191702137.0, + "step": 4145 + }, + { + "epoch": 1.476580587711487, + "grad_norm": 0.46804726123809814, + "learning_rate": 1e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7843085378408432, + "num_tokens": 1192003323.0, + "step": 4146 + }, + { + "epoch": 1.4769367764915406, + "grad_norm": 0.5012897849082947, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.757644534111023, + "num_tokens": 1192259189.0, + "step": 4147 + }, + { + "epoch": 1.477292965271594, + "grad_norm": 0.48242685198783875, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 
0.7514459043741226, + "num_tokens": 1192544365.0, + "step": 4148 + }, + { + "epoch": 1.4776491540516474, + "grad_norm": 0.4881986677646637, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7746480703353882, + "num_tokens": 1192797369.0, + "step": 4149 + }, + { + "epoch": 1.4780053428317008, + "grad_norm": 0.5179899334907532, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7528262287378311, + "num_tokens": 1193055637.0, + "step": 4150 + }, + { + "epoch": 1.4783615316117542, + "grad_norm": 0.470580130815506, + "learning_rate": 1e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.754232183098793, + "num_tokens": 1193340168.0, + "step": 4151 + }, + { + "epoch": 1.4787177203918076, + "grad_norm": 0.47831428050994873, + "learning_rate": 1e-06, + "loss": 0.6699, + "mean_token_accuracy": 0.7814831435680389, + "num_tokens": 1193645325.0, + "step": 4152 + }, + { + "epoch": 1.479073909171861, + "grad_norm": 0.5120862126350403, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7391801476478577, + "num_tokens": 1193884955.0, + "step": 4153 + }, + { + "epoch": 1.4794300979519144, + "grad_norm": 0.47498950362205505, + "learning_rate": 1e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7773189842700958, + "num_tokens": 1194167249.0, + "step": 4154 + }, + { + "epoch": 1.479786286731968, + "grad_norm": 0.4664655327796936, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7639521062374115, + "num_tokens": 1194445296.0, + "step": 4155 + }, + { + "epoch": 1.4801424755120214, + "grad_norm": 0.4843697249889374, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7564477473497391, + "num_tokens": 1194717484.0, + "step": 4156 + }, + { + "epoch": 1.4804986642920748, + "grad_norm": 0.5052169561386108, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7563247084617615, + "num_tokens": 1194994320.0, + "step": 4157 + }, + { + "epoch": 1.4808548530721282, + "grad_norm": 
0.5223889946937561, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7686818689107895, + "num_tokens": 1195247015.0, + "step": 4158 + }, + { + "epoch": 1.4812110418521818, + "grad_norm": 0.4700487554073334, + "learning_rate": 1e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7724321186542511, + "num_tokens": 1195519525.0, + "step": 4159 + }, + { + "epoch": 1.4815672306322352, + "grad_norm": 0.4606248736381531, + "learning_rate": 1e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.779153898358345, + "num_tokens": 1195798749.0, + "step": 4160 + }, + { + "epoch": 1.4819234194122886, + "grad_norm": 0.47839826345443726, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7624248117208481, + "num_tokens": 1196078223.0, + "step": 4161 + }, + { + "epoch": 1.482279608192342, + "grad_norm": 0.505441427230835, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7526623904705048, + "num_tokens": 1196341034.0, + "step": 4162 + }, + { + "epoch": 1.4826357969723953, + "grad_norm": 0.4630766212940216, + "learning_rate": 1e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7703706920146942, + "num_tokens": 1196649932.0, + "step": 4163 + }, + { + "epoch": 1.4829919857524487, + "grad_norm": 0.44526705145835876, + "learning_rate": 1e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7618830353021622, + "num_tokens": 1196934684.0, + "step": 4164 + }, + { + "epoch": 1.4833481745325021, + "grad_norm": 0.48803049325942993, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.775346115231514, + "num_tokens": 1197212325.0, + "step": 4165 + }, + { + "epoch": 1.4837043633125557, + "grad_norm": 0.47340258955955505, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7431514710187912, + "num_tokens": 1197473468.0, + "step": 4166 + }, + { + "epoch": 1.4840605520926091, + "grad_norm": 0.4564134180545807, + "learning_rate": 1e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7570710778236389, + 
"num_tokens": 1197764220.0, + "step": 4167 + }, + { + "epoch": 1.4844167408726625, + "grad_norm": 0.4870818853378296, + "learning_rate": 1e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7749391049146652, + "num_tokens": 1198061412.0, + "step": 4168 + }, + { + "epoch": 1.484772929652716, + "grad_norm": 0.4544984698295593, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7695904821157455, + "num_tokens": 1198368114.0, + "step": 4169 + }, + { + "epoch": 1.4851291184327693, + "grad_norm": 0.44032734632492065, + "learning_rate": 1e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7734810709953308, + "num_tokens": 1198675138.0, + "step": 4170 + }, + { + "epoch": 1.485485307212823, + "grad_norm": 0.4873647093772888, + "learning_rate": 1e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7737401723861694, + "num_tokens": 1198937696.0, + "step": 4171 + }, + { + "epoch": 1.4858414959928763, + "grad_norm": 0.5222610831260681, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7465291023254395, + "num_tokens": 1199215039.0, + "step": 4172 + }, + { + "epoch": 1.4861976847729297, + "grad_norm": 0.4929622411727905, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7785497456789017, + "num_tokens": 1199504674.0, + "step": 4173 + }, + { + "epoch": 1.486553873552983, + "grad_norm": 0.4521874785423279, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7645779103040695, + "num_tokens": 1199784561.0, + "step": 4174 + }, + { + "epoch": 1.4869100623330365, + "grad_norm": 0.44128963351249695, + "learning_rate": 1e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7822135984897614, + "num_tokens": 1200078222.0, + "step": 4175 + }, + { + "epoch": 1.4872662511130899, + "grad_norm": 0.5156322121620178, + "learning_rate": 1e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.7683339864015579, + "num_tokens": 1200356058.0, + "step": 4176 + }, + { + "epoch": 1.4876224398931432, + "grad_norm": 0.48733776807785034, + 
"learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7689052075147629, + "num_tokens": 1200635166.0, + "step": 4177 + }, + { + "epoch": 1.4879786286731969, + "grad_norm": 0.4742787182331085, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7685070931911469, + "num_tokens": 1200927740.0, + "step": 4178 + }, + { + "epoch": 1.4883348174532502, + "grad_norm": 0.46825191378593445, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7791215777397156, + "num_tokens": 1201203896.0, + "step": 4179 + }, + { + "epoch": 1.4886910062333036, + "grad_norm": 0.46739643812179565, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7586089819669724, + "num_tokens": 1201474146.0, + "step": 4180 + }, + { + "epoch": 1.489047195013357, + "grad_norm": 0.4672260880470276, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7660354524850845, + "num_tokens": 1201768108.0, + "step": 4181 + }, + { + "epoch": 1.4894033837934106, + "grad_norm": 0.5052569508552551, + "learning_rate": 1e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7700063139200211, + "num_tokens": 1202016979.0, + "step": 4182 + }, + { + "epoch": 1.489759572573464, + "grad_norm": 0.475739985704422, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7654332220554352, + "num_tokens": 1202290953.0, + "step": 4183 + }, + { + "epoch": 1.4901157613535174, + "grad_norm": 0.4573538601398468, + "learning_rate": 1e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7774403989315033, + "num_tokens": 1202598154.0, + "step": 4184 + }, + { + "epoch": 1.4904719501335708, + "grad_norm": 0.45284757018089294, + "learning_rate": 1e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.772826299071312, + "num_tokens": 1202905292.0, + "step": 4185 + }, + { + "epoch": 1.4908281389136242, + "grad_norm": 0.4803445041179657, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7584325224161148, + "num_tokens": 1203176999.0, + 
"step": 4186 + }, + { + "epoch": 1.4911843276936776, + "grad_norm": 0.44962596893310547, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7641026824712753, + "num_tokens": 1203495671.0, + "step": 4187 + }, + { + "epoch": 1.491540516473731, + "grad_norm": 0.4690602421760559, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7645460665225983, + "num_tokens": 1203783029.0, + "step": 4188 + }, + { + "epoch": 1.4918967052537844, + "grad_norm": 0.4741329550743103, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7670275121927261, + "num_tokens": 1204077539.0, + "step": 4189 + }, + { + "epoch": 1.492252894033838, + "grad_norm": 0.44214656949043274, + "learning_rate": 1e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7799174934625626, + "num_tokens": 1204391110.0, + "step": 4190 + }, + { + "epoch": 1.4926090828138914, + "grad_norm": 0.4794764816761017, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7423671185970306, + "num_tokens": 1204680496.0, + "step": 4191 + }, + { + "epoch": 1.4929652715939448, + "grad_norm": 0.5193971991539001, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.75502148270607, + "num_tokens": 1204957352.0, + "step": 4192 + }, + { + "epoch": 1.4933214603739982, + "grad_norm": 0.4976612329483032, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7653176188468933, + "num_tokens": 1205242375.0, + "step": 4193 + }, + { + "epoch": 1.4936776491540518, + "grad_norm": 0.5108681917190552, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.754155308008194, + "num_tokens": 1205505514.0, + "step": 4194 + }, + { + "epoch": 1.4940338379341052, + "grad_norm": 0.5513636469841003, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7633592784404755, + "num_tokens": 1205770537.0, + "step": 4195 + }, + { + "epoch": 1.4943900267141585, + "grad_norm": 0.5055350065231323, + "learning_rate": 1e-06, + 
"loss": 0.6459, + "mean_token_accuracy": 0.7871521562337875, + "num_tokens": 1206043882.0, + "step": 4196 + }, + { + "epoch": 1.494746215494212, + "grad_norm": 0.458964079618454, + "learning_rate": 1e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7690022140741348, + "num_tokens": 1206329064.0, + "step": 4197 + }, + { + "epoch": 1.4951024042742653, + "grad_norm": 0.4411281645298004, + "learning_rate": 1e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.7948614656925201, + "num_tokens": 1206646109.0, + "step": 4198 + }, + { + "epoch": 1.4954585930543187, + "grad_norm": 0.5281673669815063, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7552907168865204, + "num_tokens": 1206902950.0, + "step": 4199 + }, + { + "epoch": 1.495814781834372, + "grad_norm": 0.5053934454917908, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7647332102060318, + "num_tokens": 1207199763.0, + "step": 4200 + }, + { + "epoch": 1.4961709706144257, + "grad_norm": 0.5026566982269287, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7599058151245117, + "num_tokens": 1207446010.0, + "step": 4201 + }, + { + "epoch": 1.4965271593944791, + "grad_norm": 0.4463314712047577, + "learning_rate": 1e-06, + "loss": 0.6387, + "mean_token_accuracy": 0.7816451340913773, + "num_tokens": 1207728788.0, + "step": 4202 + }, + { + "epoch": 1.4968833481745325, + "grad_norm": 0.48211100697517395, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7689341306686401, + "num_tokens": 1208018892.0, + "step": 4203 + }, + { + "epoch": 1.497239536954586, + "grad_norm": 0.47122761607170105, + "learning_rate": 1e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7625983953475952, + "num_tokens": 1208316559.0, + "step": 4204 + }, + { + "epoch": 1.4975957257346393, + "grad_norm": 0.4888172149658203, + "learning_rate": 1e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.7837090045213699, + "num_tokens": 1208593883.0, + "step": 4205 + }, + { + 
"epoch": 1.497951914514693, + "grad_norm": 0.4489794075489044, + "learning_rate": 1e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7555966228246689, + "num_tokens": 1208901611.0, + "step": 4206 + }, + { + "epoch": 1.4983081032947463, + "grad_norm": 0.4735153615474701, + "learning_rate": 1e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7518378049135208, + "num_tokens": 1209177081.0, + "step": 4207 + }, + { + "epoch": 1.4986642920747997, + "grad_norm": 0.4538586139678955, + "learning_rate": 1e-06, + "loss": 0.666, + "mean_token_accuracy": 0.785745233297348, + "num_tokens": 1209477321.0, + "step": 4208 + }, + { + "epoch": 1.499020480854853, + "grad_norm": 0.44143158197402954, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7665117681026459, + "num_tokens": 1209773306.0, + "step": 4209 + }, + { + "epoch": 1.4993766696349065, + "grad_norm": 0.45886296033859253, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7705270051956177, + "num_tokens": 1210094810.0, + "step": 4210 + }, + { + "epoch": 1.4997328584149598, + "grad_norm": 0.4734325706958771, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7692495286464691, + "num_tokens": 1210369756.0, + "step": 4211 + }, + { + "epoch": 1.5000890471950132, + "grad_norm": 0.47921544313430786, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7398408502340317, + "num_tokens": 1210677657.0, + "step": 4212 + }, + { + "epoch": 1.5004452359750666, + "grad_norm": 0.47170475125312805, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.754845380783081, + "num_tokens": 1210969265.0, + "step": 4213 + }, + { + "epoch": 1.5008014247551202, + "grad_norm": 0.5149799585342407, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7586684226989746, + "num_tokens": 1211250286.0, + "step": 4214 + }, + { + "epoch": 1.5011576135351736, + "grad_norm": 0.45903316140174866, + "learning_rate": 1e-06, + "loss": 0.7924, + 
"mean_token_accuracy": 0.7501177936792374, + "num_tokens": 1211546920.0, + "step": 4215 + }, + { + "epoch": 1.501513802315227, + "grad_norm": 0.4360673725605011, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7747966200113297, + "num_tokens": 1211834078.0, + "step": 4216 + }, + { + "epoch": 1.5018699910952806, + "grad_norm": 0.4452776610851288, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7671398222446442, + "num_tokens": 1212136859.0, + "step": 4217 + }, + { + "epoch": 1.502226179875334, + "grad_norm": 0.4214012622833252, + "learning_rate": 1e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7652258276939392, + "num_tokens": 1212462095.0, + "step": 4218 + }, + { + "epoch": 1.5025823686553874, + "grad_norm": 0.4426431357860565, + "learning_rate": 1e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7730182260274887, + "num_tokens": 1212736139.0, + "step": 4219 + }, + { + "epoch": 1.5029385574354408, + "grad_norm": 0.4808559715747833, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7431416660547256, + "num_tokens": 1212995891.0, + "step": 4220 + }, + { + "epoch": 1.5032947462154942, + "grad_norm": 0.47184693813323975, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7524614036083221, + "num_tokens": 1213284068.0, + "step": 4221 + }, + { + "epoch": 1.5036509349955476, + "grad_norm": 0.48245224356651306, + "learning_rate": 1e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.743750125169754, + "num_tokens": 1213552268.0, + "step": 4222 + }, + { + "epoch": 1.504007123775601, + "grad_norm": 0.47978919744491577, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7623315155506134, + "num_tokens": 1213847872.0, + "step": 4223 + }, + { + "epoch": 1.5043633125556544, + "grad_norm": 0.46267566084861755, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7614514827728271, + "num_tokens": 1214115313.0, + "step": 4224 + }, + { + "epoch": 
1.504719501335708, + "grad_norm": 0.466996967792511, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7596570253372192, + "num_tokens": 1214392060.0, + "step": 4225 + }, + { + "epoch": 1.5050756901157614, + "grad_norm": 0.4727163314819336, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7721693068742752, + "num_tokens": 1214701696.0, + "step": 4226 + }, + { + "epoch": 1.5054318788958148, + "grad_norm": 0.46675294637680054, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7525193393230438, + "num_tokens": 1214984729.0, + "step": 4227 + }, + { + "epoch": 1.5057880676758684, + "grad_norm": 0.48952677845954895, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7714290469884872, + "num_tokens": 1215254763.0, + "step": 4228 + }, + { + "epoch": 1.5061442564559218, + "grad_norm": 0.48324060440063477, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7533996105194092, + "num_tokens": 1215528503.0, + "step": 4229 + }, + { + "epoch": 1.5065004452359751, + "grad_norm": 0.47149521112442017, + "learning_rate": 1e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.7838516086339951, + "num_tokens": 1215819883.0, + "step": 4230 + }, + { + "epoch": 1.5068566340160285, + "grad_norm": 0.4854046404361725, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7693105041980743, + "num_tokens": 1216105999.0, + "step": 4231 + }, + { + "epoch": 1.507212822796082, + "grad_norm": 0.4497813880443573, + "learning_rate": 1e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7785116583108902, + "num_tokens": 1216399514.0, + "step": 4232 + }, + { + "epoch": 1.5075690115761353, + "grad_norm": 0.5427193641662598, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7750900834798813, + "num_tokens": 1216674405.0, + "step": 4233 + }, + { + "epoch": 1.5079252003561887, + "grad_norm": 0.4327215552330017, + "learning_rate": 1e-06, + "loss": 0.6641, + 
"mean_token_accuracy": 0.7853349149227142, + "num_tokens": 1216999970.0, + "step": 4234 + }, + { + "epoch": 1.508281389136242, + "grad_norm": 0.4557473063468933, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7521520256996155, + "num_tokens": 1217307141.0, + "step": 4235 + }, + { + "epoch": 1.5086375779162955, + "grad_norm": 0.4634695053100586, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7624821215867996, + "num_tokens": 1217604883.0, + "step": 4236 + }, + { + "epoch": 1.508993766696349, + "grad_norm": 0.4316052496433258, + "learning_rate": 1e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7570154219865799, + "num_tokens": 1217915831.0, + "step": 4237 + }, + { + "epoch": 1.5093499554764025, + "grad_norm": 0.4973883330821991, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7615836560726166, + "num_tokens": 1218166082.0, + "step": 4238 + }, + { + "epoch": 1.5097061442564559, + "grad_norm": 0.4939664304256439, + "learning_rate": 1e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7564683258533478, + "num_tokens": 1218451093.0, + "step": 4239 + }, + { + "epoch": 1.5100623330365095, + "grad_norm": 0.4598694443702698, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7649083435535431, + "num_tokens": 1218780646.0, + "step": 4240 + }, + { + "epoch": 1.5104185218165629, + "grad_norm": 0.5459133982658386, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7596654295921326, + "num_tokens": 1219047904.0, + "step": 4241 + }, + { + "epoch": 1.5107747105966163, + "grad_norm": 0.5128939747810364, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7674064785242081, + "num_tokens": 1219315726.0, + "step": 4242 + }, + { + "epoch": 1.5111308993766697, + "grad_norm": 0.4183829426765442, + "learning_rate": 1e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.77152980864048, + "num_tokens": 1219659118.0, + "step": 4243 + }, + { + "epoch": 
1.511487088156723, + "grad_norm": 0.4794139564037323, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7674021869897842, + "num_tokens": 1219947148.0, + "step": 4244 + }, + { + "epoch": 1.5118432769367764, + "grad_norm": 0.4644397497177124, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7743272334337234, + "num_tokens": 1220219153.0, + "step": 4245 + }, + { + "epoch": 1.5121994657168298, + "grad_norm": 0.44187355041503906, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.770942822098732, + "num_tokens": 1220530645.0, + "step": 4246 + }, + { + "epoch": 1.5125556544968832, + "grad_norm": 0.516531765460968, + "learning_rate": 1e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7582550644874573, + "num_tokens": 1220816623.0, + "step": 4247 + }, + { + "epoch": 1.5129118432769366, + "grad_norm": 0.47907114028930664, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7712732553482056, + "num_tokens": 1221099238.0, + "step": 4248 + }, + { + "epoch": 1.5132680320569902, + "grad_norm": 0.5307239294052124, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7613063901662827, + "num_tokens": 1221369175.0, + "step": 4249 + }, + { + "epoch": 1.5136242208370436, + "grad_norm": 0.4836655855178833, + "learning_rate": 1e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7791010588407516, + "num_tokens": 1221673640.0, + "step": 4250 + }, + { + "epoch": 1.513980409617097, + "grad_norm": 0.4650121033191681, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7543721795082092, + "num_tokens": 1221951619.0, + "step": 4251 + }, + { + "epoch": 1.5143365983971506, + "grad_norm": 0.448201060295105, + "learning_rate": 1e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7798407524824142, + "num_tokens": 1222257865.0, + "step": 4252 + }, + { + "epoch": 1.514692787177204, + "grad_norm": 0.4685947895050049, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 
0.7657839059829712, + "num_tokens": 1222538082.0, + "step": 4253 + }, + { + "epoch": 1.5150489759572574, + "grad_norm": 0.5112437605857849, + "learning_rate": 1e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7645119577646255, + "num_tokens": 1222821200.0, + "step": 4254 + }, + { + "epoch": 1.5154051647373108, + "grad_norm": 0.4368942677974701, + "learning_rate": 1e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7778719961643219, + "num_tokens": 1223164440.0, + "step": 4255 + }, + { + "epoch": 1.5157613535173642, + "grad_norm": 0.4981898367404938, + "learning_rate": 1e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7784046530723572, + "num_tokens": 1223443969.0, + "step": 4256 + }, + { + "epoch": 1.5161175422974176, + "grad_norm": 0.4253232181072235, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7606494575738907, + "num_tokens": 1223750109.0, + "step": 4257 + }, + { + "epoch": 1.516473731077471, + "grad_norm": 0.47008296847343445, + "learning_rate": 1e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7679232656955719, + "num_tokens": 1224061782.0, + "step": 4258 + }, + { + "epoch": 1.5168299198575244, + "grad_norm": 0.4533368945121765, + "learning_rate": 1e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7738271057605743, + "num_tokens": 1224356708.0, + "step": 4259 + }, + { + "epoch": 1.517186108637578, + "grad_norm": 0.4236413538455963, + "learning_rate": 1e-06, + "loss": 0.6782, + "mean_token_accuracy": 0.7802722305059433, + "num_tokens": 1224667448.0, + "step": 4260 + }, + { + "epoch": 1.5175422974176314, + "grad_norm": 0.4424011707305908, + "learning_rate": 1e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7669848948717117, + "num_tokens": 1224972162.0, + "step": 4261 + }, + { + "epoch": 1.5178984861976847, + "grad_norm": 0.4736715853214264, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7566172480583191, + "num_tokens": 1225291343.0, + "step": 4262 + }, + { + "epoch": 1.5182546749777384, + "grad_norm": 
0.4773866832256317, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7650832831859589, + "num_tokens": 1225568729.0, + "step": 4263 + }, + { + "epoch": 1.5186108637577918, + "grad_norm": 0.4574570953845978, + "learning_rate": 1e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7675938308238983, + "num_tokens": 1225893823.0, + "step": 4264 + }, + { + "epoch": 1.5189670525378451, + "grad_norm": 0.43587812781333923, + "learning_rate": 1e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7806160748004913, + "num_tokens": 1226200403.0, + "step": 4265 + }, + { + "epoch": 1.5193232413178985, + "grad_norm": 0.5095962882041931, + "learning_rate": 1e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7615822702646255, + "num_tokens": 1226449747.0, + "step": 4266 + }, + { + "epoch": 1.519679430097952, + "grad_norm": 0.4564603865146637, + "learning_rate": 1e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7679731696844101, + "num_tokens": 1226775794.0, + "step": 4267 + }, + { + "epoch": 1.5200356188780053, + "grad_norm": 0.46725228428840637, + "learning_rate": 1e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7744534313678741, + "num_tokens": 1227095765.0, + "step": 4268 + }, + { + "epoch": 1.5203918076580587, + "grad_norm": 0.4849209487438202, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7431063503026962, + "num_tokens": 1227338747.0, + "step": 4269 + }, + { + "epoch": 1.520747996438112, + "grad_norm": 0.48721182346343994, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7666630893945694, + "num_tokens": 1227624465.0, + "step": 4270 + }, + { + "epoch": 1.5211041852181655, + "grad_norm": 0.4760516583919525, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7633416503667831, + "num_tokens": 1227912002.0, + "step": 4271 + }, + { + "epoch": 1.521460373998219, + "grad_norm": 0.44434159994125366, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7672146111726761, + 
"num_tokens": 1228183369.0, + "step": 4272 + }, + { + "epoch": 1.5218165627782725, + "grad_norm": 0.48158085346221924, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7691155225038528, + "num_tokens": 1228455381.0, + "step": 4273 + }, + { + "epoch": 1.5221727515583259, + "grad_norm": 0.46062496304512024, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7673779428005219, + "num_tokens": 1228735290.0, + "step": 4274 + }, + { + "epoch": 1.5225289403383795, + "grad_norm": 0.46519702672958374, + "learning_rate": 1e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7826924175024033, + "num_tokens": 1229032419.0, + "step": 4275 + }, + { + "epoch": 1.5228851291184329, + "grad_norm": 0.48986178636550903, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.765956312417984, + "num_tokens": 1229303318.0, + "step": 4276 + }, + { + "epoch": 1.5232413178984863, + "grad_norm": 0.47807130217552185, + "learning_rate": 1e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7545083612203598, + "num_tokens": 1229557666.0, + "step": 4277 + }, + { + "epoch": 1.5235975066785397, + "grad_norm": 0.5273579359054565, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7555869817733765, + "num_tokens": 1229820577.0, + "step": 4278 + }, + { + "epoch": 1.523953695458593, + "grad_norm": 0.49202340841293335, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7642170786857605, + "num_tokens": 1230106772.0, + "step": 4279 + }, + { + "epoch": 1.5243098842386464, + "grad_norm": 0.47802218794822693, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7629006505012512, + "num_tokens": 1230393876.0, + "step": 4280 + }, + { + "epoch": 1.5246660730186998, + "grad_norm": 0.4329967796802521, + "learning_rate": 1e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.760870561003685, + "num_tokens": 1230681300.0, + "step": 4281 + }, + { + "epoch": 1.5250222617987532, + "grad_norm": 
0.4859238564968109, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7626774609088898, + "num_tokens": 1230956083.0, + "step": 4282 + }, + { + "epoch": 1.5253784505788066, + "grad_norm": 0.4553457200527191, + "learning_rate": 1e-06, + "loss": 0.658, + "mean_token_accuracy": 0.7863836139440536, + "num_tokens": 1231281558.0, + "step": 4283 + }, + { + "epoch": 1.5257346393588602, + "grad_norm": 0.46581199765205383, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7572552412748337, + "num_tokens": 1231570506.0, + "step": 4284 + }, + { + "epoch": 1.5260908281389136, + "grad_norm": 0.48177382349967957, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7591952979564667, + "num_tokens": 1231872669.0, + "step": 4285 + }, + { + "epoch": 1.526447016918967, + "grad_norm": 0.47162574529647827, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7621413171291351, + "num_tokens": 1232183906.0, + "step": 4286 + }, + { + "epoch": 1.5268032056990206, + "grad_norm": 0.456425279378891, + "learning_rate": 1e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7535152286291122, + "num_tokens": 1232459991.0, + "step": 4287 + }, + { + "epoch": 1.527159394479074, + "grad_norm": 0.5236005783081055, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7712657749652863, + "num_tokens": 1232728129.0, + "step": 4288 + }, + { + "epoch": 1.5275155832591274, + "grad_norm": 0.466916561126709, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7655410915613174, + "num_tokens": 1233041071.0, + "step": 4289 + }, + { + "epoch": 1.5278717720391808, + "grad_norm": 0.444923996925354, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7651481926441193, + "num_tokens": 1233356769.0, + "step": 4290 + }, + { + "epoch": 1.5282279608192342, + "grad_norm": 0.5026310682296753, + "learning_rate": 1e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7717760503292084, + 
"num_tokens": 1233640756.0, + "step": 4291 + }, + { + "epoch": 1.5285841495992876, + "grad_norm": 0.5016206502914429, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.764001652598381, + "num_tokens": 1233888335.0, + "step": 4292 + }, + { + "epoch": 1.528940338379341, + "grad_norm": 0.43480315804481506, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7709409445524216, + "num_tokens": 1234204947.0, + "step": 4293 + }, + { + "epoch": 1.5292965271593943, + "grad_norm": 0.5165376663208008, + "learning_rate": 1e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7561906725168228, + "num_tokens": 1234469703.0, + "step": 4294 + }, + { + "epoch": 1.529652715939448, + "grad_norm": 0.48086416721343994, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7713241428136826, + "num_tokens": 1234775276.0, + "step": 4295 + }, + { + "epoch": 1.5300089047195014, + "grad_norm": 0.5352383852005005, + "learning_rate": 1e-06, + "loss": 0.701, + "mean_token_accuracy": 0.774626687169075, + "num_tokens": 1235041109.0, + "step": 4296 + }, + { + "epoch": 1.5303650934995547, + "grad_norm": 0.4320238530635834, + "learning_rate": 1e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7675020843744278, + "num_tokens": 1235322818.0, + "step": 4297 + }, + { + "epoch": 1.5307212822796084, + "grad_norm": 0.48410314321517944, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7506937384605408, + "num_tokens": 1235592450.0, + "step": 4298 + }, + { + "epoch": 1.5310774710596617, + "grad_norm": 0.44469892978668213, + "learning_rate": 1e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7570683509111404, + "num_tokens": 1235939145.0, + "step": 4299 + }, + { + "epoch": 1.5314336598397151, + "grad_norm": 0.4565105140209198, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7743257731199265, + "num_tokens": 1236249623.0, + "step": 4300 + }, + { + "epoch": 1.5317898486197685, + "grad_norm": 0.4531629681587219, + 
"learning_rate": 1e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7527397572994232, + "num_tokens": 1236532060.0, + "step": 4301 + }, + { + "epoch": 1.532146037399822, + "grad_norm": 0.46719351410865784, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7605261504650116, + "num_tokens": 1236820365.0, + "step": 4302 + }, + { + "epoch": 1.5325022261798753, + "grad_norm": 0.4385693371295929, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7768382728099823, + "num_tokens": 1237167641.0, + "step": 4303 + }, + { + "epoch": 1.5328584149599287, + "grad_norm": 0.44175946712493896, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7621825784444809, + "num_tokens": 1237486897.0, + "step": 4304 + }, + { + "epoch": 1.533214603739982, + "grad_norm": 0.44885435700416565, + "learning_rate": 1e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7853352725505829, + "num_tokens": 1237781427.0, + "step": 4305 + }, + { + "epoch": 1.5335707925200355, + "grad_norm": 0.4402085244655609, + "learning_rate": 1e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7564686685800552, + "num_tokens": 1238066148.0, + "step": 4306 + }, + { + "epoch": 1.533926981300089, + "grad_norm": 0.4547639787197113, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7741773277521133, + "num_tokens": 1238357317.0, + "step": 4307 + }, + { + "epoch": 1.5342831700801425, + "grad_norm": 0.46083495020866394, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7676101177930832, + "num_tokens": 1238665006.0, + "step": 4308 + }, + { + "epoch": 1.5346393588601959, + "grad_norm": 0.44236528873443604, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7614369988441467, + "num_tokens": 1238960183.0, + "step": 4309 + }, + { + "epoch": 1.5349955476402495, + "grad_norm": 0.4252101480960846, + "learning_rate": 1e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.7739776819944382, + "num_tokens": 1239297558.0, 
+ "step": 4310 + }, + { + "epoch": 1.5353517364203029, + "grad_norm": 0.47259756922721863, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.775894284248352, + "num_tokens": 1239595821.0, + "step": 4311 + }, + { + "epoch": 1.5357079252003563, + "grad_norm": 0.4766407907009125, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7731916904449463, + "num_tokens": 1239885088.0, + "step": 4312 + }, + { + "epoch": 1.5360641139804097, + "grad_norm": 0.3935624957084656, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.767921581864357, + "num_tokens": 1240206229.0, + "step": 4313 + }, + { + "epoch": 1.536420302760463, + "grad_norm": 0.4554288685321808, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7590271681547165, + "num_tokens": 1240455632.0, + "step": 4314 + }, + { + "epoch": 1.5367764915405164, + "grad_norm": 0.46656665205955505, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7680568099021912, + "num_tokens": 1240750634.0, + "step": 4315 + }, + { + "epoch": 1.5371326803205698, + "grad_norm": 0.4500221312046051, + "learning_rate": 1e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.7831415086984634, + "num_tokens": 1241028178.0, + "step": 4316 + }, + { + "epoch": 1.5374888691006232, + "grad_norm": 0.5074875354766846, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7320391535758972, + "num_tokens": 1241311152.0, + "step": 4317 + }, + { + "epoch": 1.5378450578806766, + "grad_norm": 0.4844883680343628, + "learning_rate": 1e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.765711322426796, + "num_tokens": 1241577815.0, + "step": 4318 + }, + { + "epoch": 1.5382012466607302, + "grad_norm": 0.4797791838645935, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7577085196971893, + "num_tokens": 1241891141.0, + "step": 4319 + }, + { + "epoch": 1.5385574354407836, + "grad_norm": 0.45407336950302124, + "learning_rate": 1e-06, + 
"loss": 0.723, + "mean_token_accuracy": 0.7696940153837204, + "num_tokens": 1242185538.0, + "step": 4320 + }, + { + "epoch": 1.538913624220837, + "grad_norm": 0.44933372735977173, + "learning_rate": 1e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.7857713103294373, + "num_tokens": 1242473517.0, + "step": 4321 + }, + { + "epoch": 1.5392698130008906, + "grad_norm": 0.43455594778060913, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7604698836803436, + "num_tokens": 1242763931.0, + "step": 4322 + }, + { + "epoch": 1.539626001780944, + "grad_norm": 0.47787702083587646, + "learning_rate": 1e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7482419162988663, + "num_tokens": 1243030682.0, + "step": 4323 + }, + { + "epoch": 1.5399821905609974, + "grad_norm": 0.5025812983512878, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7623390108346939, + "num_tokens": 1243304724.0, + "step": 4324 + }, + { + "epoch": 1.5403383793410508, + "grad_norm": 0.4261225163936615, + "learning_rate": 1e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7810482680797577, + "num_tokens": 1243596246.0, + "step": 4325 + }, + { + "epoch": 1.5406945681211042, + "grad_norm": 0.4566344916820526, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7682728916406631, + "num_tokens": 1243904468.0, + "step": 4326 + }, + { + "epoch": 1.5410507569011576, + "grad_norm": 0.43373170495033264, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7667131721973419, + "num_tokens": 1244212842.0, + "step": 4327 + }, + { + "epoch": 1.541406945681211, + "grad_norm": 0.4627005159854889, + "learning_rate": 1e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7777293771505356, + "num_tokens": 1244493887.0, + "step": 4328 + }, + { + "epoch": 1.5417631344612643, + "grad_norm": 0.4499332904815674, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7770901620388031, + "num_tokens": 1244820353.0, + "step": 4329 + }, + { + 
"epoch": 1.542119323241318, + "grad_norm": 0.5285834670066833, + "learning_rate": 1e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7709513604640961, + "num_tokens": 1245074580.0, + "step": 4330 + }, + { + "epoch": 1.5424755120213713, + "grad_norm": 0.42723047733306885, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.76588936150074, + "num_tokens": 1245378302.0, + "step": 4331 + }, + { + "epoch": 1.5428317008014247, + "grad_norm": 0.47446584701538086, + "learning_rate": 1e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7765232920646667, + "num_tokens": 1245641583.0, + "step": 4332 + }, + { + "epoch": 1.5431878895814783, + "grad_norm": 0.43938717246055603, + "learning_rate": 1e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7608886361122131, + "num_tokens": 1245946907.0, + "step": 4333 + }, + { + "epoch": 1.5435440783615317, + "grad_norm": 0.42458462715148926, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7490401118993759, + "num_tokens": 1246243596.0, + "step": 4334 + }, + { + "epoch": 1.5439002671415851, + "grad_norm": 0.45952704548835754, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7624997496604919, + "num_tokens": 1246545499.0, + "step": 4335 + }, + { + "epoch": 1.5442564559216385, + "grad_norm": 0.4479047954082489, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7517183572053909, + "num_tokens": 1246822359.0, + "step": 4336 + }, + { + "epoch": 1.544612644701692, + "grad_norm": 0.4929687976837158, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7610863149166107, + "num_tokens": 1247104858.0, + "step": 4337 + }, + { + "epoch": 1.5449688334817453, + "grad_norm": 0.48917505145072937, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7718068808317184, + "num_tokens": 1247375919.0, + "step": 4338 + }, + { + "epoch": 1.5453250222617987, + "grad_norm": 0.46939942240715027, + "learning_rate": 1e-06, + "loss": 0.6533, + 
"mean_token_accuracy": 0.7853260785341263, + "num_tokens": 1247649183.0, + "step": 4339 + }, + { + "epoch": 1.545681211041852, + "grad_norm": 0.4539019465446472, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7640178799629211, + "num_tokens": 1247960775.0, + "step": 4340 + }, + { + "epoch": 1.5460373998219055, + "grad_norm": 0.4646659195423126, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7691512703895569, + "num_tokens": 1248279016.0, + "step": 4341 + }, + { + "epoch": 1.546393588601959, + "grad_norm": 0.4984736740589142, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7304060906171799, + "num_tokens": 1248567788.0, + "step": 4342 + }, + { + "epoch": 1.5467497773820125, + "grad_norm": 0.4567613899707794, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7576537132263184, + "num_tokens": 1248869520.0, + "step": 4343 + }, + { + "epoch": 1.5471059661620659, + "grad_norm": 0.45502999424934387, + "learning_rate": 1e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.7704736739397049, + "num_tokens": 1249188681.0, + "step": 4344 + }, + { + "epoch": 1.5474621549421195, + "grad_norm": 0.46271318197250366, + "learning_rate": 1e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.773109033703804, + "num_tokens": 1249465360.0, + "step": 4345 + }, + { + "epoch": 1.5478183437221729, + "grad_norm": 0.4918312728404999, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7649189531803131, + "num_tokens": 1249757989.0, + "step": 4346 + }, + { + "epoch": 1.5481745325022263, + "grad_norm": 0.4784466326236725, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.754507839679718, + "num_tokens": 1250048996.0, + "step": 4347 + }, + { + "epoch": 1.5485307212822796, + "grad_norm": 0.4843384623527527, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.755059003829956, + "num_tokens": 1250331696.0, + "step": 4348 + }, + { + "epoch": 
1.548886910062333, + "grad_norm": 0.4736637473106384, + "learning_rate": 1e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7703058868646622, + "num_tokens": 1250616879.0, + "step": 4349 + }, + { + "epoch": 1.5492430988423864, + "grad_norm": 0.4554233253002167, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7466762363910675, + "num_tokens": 1250868757.0, + "step": 4350 + }, + { + "epoch": 1.5495992876224398, + "grad_norm": 0.44350191950798035, + "learning_rate": 1e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7766771465539932, + "num_tokens": 1251190143.0, + "step": 4351 + }, + { + "epoch": 1.5499554764024932, + "grad_norm": 0.4212508201599121, + "learning_rate": 1e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7829677760601044, + "num_tokens": 1251517654.0, + "step": 4352 + }, + { + "epoch": 1.5503116651825466, + "grad_norm": 0.4568743407726288, + "learning_rate": 1e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7596513777971268, + "num_tokens": 1251793796.0, + "step": 4353 + }, + { + "epoch": 1.5506678539626002, + "grad_norm": 0.4878534972667694, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7754004001617432, + "num_tokens": 1252090944.0, + "step": 4354 + }, + { + "epoch": 1.5510240427426536, + "grad_norm": 0.47705385088920593, + "learning_rate": 1e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7497192621231079, + "num_tokens": 1252355647.0, + "step": 4355 + }, + { + "epoch": 1.551380231522707, + "grad_norm": 0.48563313484191895, + "learning_rate": 1e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7785743623971939, + "num_tokens": 1252651602.0, + "step": 4356 + }, + { + "epoch": 1.5517364203027606, + "grad_norm": 0.46786466240882874, + "learning_rate": 1e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.7895199209451675, + "num_tokens": 1252922835.0, + "step": 4357 + }, + { + "epoch": 1.552092609082814, + "grad_norm": 0.4677254855632782, + "learning_rate": 1e-06, + "loss": 0.7305, + 
"mean_token_accuracy": 0.7673633396625519, + "num_tokens": 1253205323.0, + "step": 4358 + }, + { + "epoch": 1.5524487978628674, + "grad_norm": 0.46559610962867737, + "learning_rate": 1e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7587781399488449, + "num_tokens": 1253501953.0, + "step": 4359 + }, + { + "epoch": 1.5528049866429208, + "grad_norm": 0.5012574791908264, + "learning_rate": 1e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7579664289951324, + "num_tokens": 1253749438.0, + "step": 4360 + }, + { + "epoch": 1.5531611754229742, + "grad_norm": 0.5043085813522339, + "learning_rate": 1e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7504813522100449, + "num_tokens": 1254010339.0, + "step": 4361 + }, + { + "epoch": 1.5535173642030276, + "grad_norm": 0.4830804467201233, + "learning_rate": 1e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7748291790485382, + "num_tokens": 1254292585.0, + "step": 4362 + }, + { + "epoch": 1.553873552983081, + "grad_norm": 0.4412928521633148, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7624788880348206, + "num_tokens": 1254601633.0, + "step": 4363 + }, + { + "epoch": 1.5542297417631343, + "grad_norm": 0.5121128559112549, + "learning_rate": 1e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7674920409917831, + "num_tokens": 1254867716.0, + "step": 4364 + }, + { + "epoch": 1.554585930543188, + "grad_norm": 0.49385127425193787, + "learning_rate": 1e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7728355824947357, + "num_tokens": 1255138456.0, + "step": 4365 + }, + { + "epoch": 1.5549421193232413, + "grad_norm": 0.47373050451278687, + "learning_rate": 1e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.771727129817009, + "num_tokens": 1255485190.0, + "step": 4366 + }, + { + "epoch": 1.5552983081032947, + "grad_norm": 0.49672845005989075, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7530267685651779, + "num_tokens": 1255755889.0, + "step": 4367 + }, + { + "epoch": 
1.5556544968833483, + "grad_norm": 0.3937080204486847, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7684490233659744, + "num_tokens": 1256043905.0, + "step": 4368 + }, + { + "epoch": 1.5560106856634017, + "grad_norm": 0.4993888735771179, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.759073942899704, + "num_tokens": 1256297055.0, + "step": 4369 + }, + { + "epoch": 1.5563668744434551, + "grad_norm": 0.4547118544578552, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7627906948328018, + "num_tokens": 1256579598.0, + "step": 4370 + }, + { + "epoch": 1.5567230632235085, + "grad_norm": 0.5227458477020264, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7680097222328186, + "num_tokens": 1256870740.0, + "step": 4371 + }, + { + "epoch": 1.557079252003562, + "grad_norm": 0.48470473289489746, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7710845172405243, + "num_tokens": 1257141686.0, + "step": 4372 + }, + { + "epoch": 1.5574354407836153, + "grad_norm": 0.47481754422187805, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7611166834831238, + "num_tokens": 1257402265.0, + "step": 4373 + }, + { + "epoch": 1.5577916295636687, + "grad_norm": 0.45655983686447144, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7583801746368408, + "num_tokens": 1257714021.0, + "step": 4374 + }, + { + "epoch": 1.558147818343722, + "grad_norm": 0.4781776964664459, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7559598088264465, + "num_tokens": 1258021872.0, + "step": 4375 + }, + { + "epoch": 1.5585040071237755, + "grad_norm": 0.45797067880630493, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7510824501514435, + "num_tokens": 1258302576.0, + "step": 4376 + }, + { + "epoch": 1.558860195903829, + "grad_norm": 0.4646289646625519, + "learning_rate": 1e-06, + "loss": 0.7532, + 
"mean_token_accuracy": 0.7637118548154831, + "num_tokens": 1258595813.0, + "step": 4377 + }, + { + "epoch": 1.5592163846838825, + "grad_norm": 0.4396739900112152, + "learning_rate": 1e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7624122500419617, + "num_tokens": 1258876599.0, + "step": 4378 + }, + { + "epoch": 1.5595725734639359, + "grad_norm": 0.47187596559524536, + "learning_rate": 1e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7565261423587799, + "num_tokens": 1259182730.0, + "step": 4379 + }, + { + "epoch": 1.5599287622439895, + "grad_norm": 0.43781235814094543, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.751617357134819, + "num_tokens": 1259468909.0, + "step": 4380 + }, + { + "epoch": 1.5602849510240429, + "grad_norm": 0.49970197677612305, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7649587690830231, + "num_tokens": 1259748234.0, + "step": 4381 + }, + { + "epoch": 1.5606411398040962, + "grad_norm": 0.5208747386932373, + "learning_rate": 1e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7809955179691315, + "num_tokens": 1260022839.0, + "step": 4382 + }, + { + "epoch": 1.5609973285841496, + "grad_norm": 0.46386048197746277, + "learning_rate": 1e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7624006122350693, + "num_tokens": 1260304152.0, + "step": 4383 + }, + { + "epoch": 1.561353517364203, + "grad_norm": 0.48509517312049866, + "learning_rate": 1e-06, + "loss": 0.6768, + "mean_token_accuracy": 0.7789046913385391, + "num_tokens": 1260585446.0, + "step": 4384 + }, + { + "epoch": 1.5617097061442564, + "grad_norm": 0.4736153781414032, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7717471420764923, + "num_tokens": 1260862697.0, + "step": 4385 + }, + { + "epoch": 1.5620658949243098, + "grad_norm": 0.44308528304100037, + "learning_rate": 1e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7825842797756195, + "num_tokens": 1261190641.0, + "step": 4386 + }, + { + "epoch": 
1.5624220837043632, + "grad_norm": 0.44590604305267334, + "learning_rate": 1e-06, + "loss": 0.614, + "mean_token_accuracy": 0.8012235909700394, + "num_tokens": 1261459618.0, + "step": 4387 + }, + { + "epoch": 1.5627782724844166, + "grad_norm": 0.48529142141342163, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7610747516155243, + "num_tokens": 1261724215.0, + "step": 4388 + }, + { + "epoch": 1.5631344612644702, + "grad_norm": 0.45731136202812195, + "learning_rate": 1e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7789942473173141, + "num_tokens": 1262023987.0, + "step": 4389 + }, + { + "epoch": 1.5634906500445236, + "grad_norm": 0.4412553310394287, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7663485109806061, + "num_tokens": 1262350131.0, + "step": 4390 + }, + { + "epoch": 1.563846838824577, + "grad_norm": 0.508041262626648, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7663859277963638, + "num_tokens": 1262635498.0, + "step": 4391 + }, + { + "epoch": 1.5642030276046306, + "grad_norm": 0.4933375418186188, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7549046277999878, + "num_tokens": 1262926806.0, + "step": 4392 + }, + { + "epoch": 1.564559216384684, + "grad_norm": 0.5490701794624329, + "learning_rate": 1e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7451098710298538, + "num_tokens": 1263182829.0, + "step": 4393 + }, + { + "epoch": 1.5649154051647374, + "grad_norm": 0.484492689371109, + "learning_rate": 1e-06, + "loss": 0.6514, + "mean_token_accuracy": 0.7880043983459473, + "num_tokens": 1263464042.0, + "step": 4394 + }, + { + "epoch": 1.5652715939447908, + "grad_norm": 0.4686501920223236, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7700883597135544, + "num_tokens": 1263769747.0, + "step": 4395 + }, + { + "epoch": 1.5656277827248442, + "grad_norm": 0.4652928411960602, + "learning_rate": 1e-06, + "loss": 0.8092, + 
"mean_token_accuracy": 0.7472244203090668, + "num_tokens": 1264063226.0, + "step": 4396 + }, + { + "epoch": 1.5659839715048975, + "grad_norm": 0.5156776905059814, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7709988504648209, + "num_tokens": 1264350626.0, + "step": 4397 + }, + { + "epoch": 1.566340160284951, + "grad_norm": 0.5206577777862549, + "learning_rate": 1e-06, + "loss": 0.6698, + "mean_token_accuracy": 0.7718081027269363, + "num_tokens": 1264582424.0, + "step": 4398 + }, + { + "epoch": 1.5666963490650043, + "grad_norm": 0.4626554548740387, + "learning_rate": 1e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7652049362659454, + "num_tokens": 1264859007.0, + "step": 4399 + }, + { + "epoch": 1.567052537845058, + "grad_norm": 0.519769549369812, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7512961626052856, + "num_tokens": 1265129972.0, + "step": 4400 + }, + { + "epoch": 1.5674087266251113, + "grad_norm": 0.48790231347084045, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7536615133285522, + "num_tokens": 1265416722.0, + "step": 4401 + }, + { + "epoch": 1.5677649154051647, + "grad_norm": 0.48592662811279297, + "learning_rate": 1e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7751102894544601, + "num_tokens": 1265707629.0, + "step": 4402 + }, + { + "epoch": 1.5681211041852183, + "grad_norm": 0.49308881163597107, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7528223395347595, + "num_tokens": 1266010239.0, + "step": 4403 + }, + { + "epoch": 1.5684772929652717, + "grad_norm": 0.4931827485561371, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.751037061214447, + "num_tokens": 1266311253.0, + "step": 4404 + }, + { + "epoch": 1.568833481745325, + "grad_norm": 0.4438512623310089, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7628259658813477, + "num_tokens": 1266609610.0, + "step": 4405 + }, + { + "epoch": 
1.5691896705253785, + "grad_norm": 0.49373775720596313, + "learning_rate": 1e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7629365921020508, + "num_tokens": 1266885244.0, + "step": 4406 + }, + { + "epoch": 1.569545859305432, + "grad_norm": 0.5054764151573181, + "learning_rate": 1e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7607620358467102, + "num_tokens": 1267181549.0, + "step": 4407 + }, + { + "epoch": 1.5699020480854853, + "grad_norm": 0.4703008234500885, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.759808212518692, + "num_tokens": 1267473637.0, + "step": 4408 + }, + { + "epoch": 1.5702582368655387, + "grad_norm": 0.5013121366500854, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7680701911449432, + "num_tokens": 1267728071.0, + "step": 4409 + }, + { + "epoch": 1.570614425645592, + "grad_norm": 0.44709208607673645, + "learning_rate": 1e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.7800592333078384, + "num_tokens": 1268001834.0, + "step": 4410 + }, + { + "epoch": 1.5709706144256455, + "grad_norm": 0.4267944097518921, + "learning_rate": 1e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7693557888269424, + "num_tokens": 1268308487.0, + "step": 4411 + }, + { + "epoch": 1.571326803205699, + "grad_norm": 0.4482806622982025, + "learning_rate": 1e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7526343762874603, + "num_tokens": 1268603653.0, + "step": 4412 + }, + { + "epoch": 1.5716829919857525, + "grad_norm": 0.5267112851142883, + "learning_rate": 1e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7777607589960098, + "num_tokens": 1268859700.0, + "step": 4413 + }, + { + "epoch": 1.5720391807658058, + "grad_norm": 0.4934224486351013, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7594199031591415, + "num_tokens": 1269137636.0, + "step": 4414 + }, + { + "epoch": 1.5723953695458595, + "grad_norm": 0.4978856146335602, + "learning_rate": 1e-06, + "loss": 0.7939, + 
"mean_token_accuracy": 0.7548218071460724, + "num_tokens": 1269403289.0, + "step": 4415 + }, + { + "epoch": 1.5727515583259128, + "grad_norm": 0.4713229835033417, + "learning_rate": 1e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7736128866672516, + "num_tokens": 1269673469.0, + "step": 4416 + }, + { + "epoch": 1.5731077471059662, + "grad_norm": 0.4598003327846527, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7674265205860138, + "num_tokens": 1269958400.0, + "step": 4417 + }, + { + "epoch": 1.5734639358860196, + "grad_norm": 0.4416792094707489, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.76764115691185, + "num_tokens": 1270265786.0, + "step": 4418 + }, + { + "epoch": 1.573820124666073, + "grad_norm": 0.5268278121948242, + "learning_rate": 1e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7665126025676727, + "num_tokens": 1270521798.0, + "step": 4419 + }, + { + "epoch": 1.5741763134461264, + "grad_norm": 0.5265551209449768, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.754144549369812, + "num_tokens": 1270815629.0, + "step": 4420 + }, + { + "epoch": 1.5745325022261798, + "grad_norm": 0.4841795563697815, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7595966160297394, + "num_tokens": 1271083799.0, + "step": 4421 + }, + { + "epoch": 1.5748886910062332, + "grad_norm": 0.44275644421577454, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7717117220163345, + "num_tokens": 1271376487.0, + "step": 4422 + }, + { + "epoch": 1.5752448797862866, + "grad_norm": 0.4772619605064392, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.764701172709465, + "num_tokens": 1271653315.0, + "step": 4423 + }, + { + "epoch": 1.5756010685663402, + "grad_norm": 0.5434783101081848, + "learning_rate": 1e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.7634943276643753, + "num_tokens": 1271897946.0, + "step": 4424 + }, + { + "epoch": 
1.5759572573463936, + "grad_norm": 0.4759472608566284, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7606724798679352, + "num_tokens": 1272166308.0, + "step": 4425 + }, + { + "epoch": 1.576313446126447, + "grad_norm": 0.43411174416542053, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7679451406002045, + "num_tokens": 1272474174.0, + "step": 4426 + }, + { + "epoch": 1.5766696349065006, + "grad_norm": 0.5120262503623962, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7644655108451843, + "num_tokens": 1272755157.0, + "step": 4427 + }, + { + "epoch": 1.577025823686554, + "grad_norm": 0.4704815447330475, + "learning_rate": 1e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.78099325299263, + "num_tokens": 1273033996.0, + "step": 4428 + }, + { + "epoch": 1.5773820124666074, + "grad_norm": 0.4624232053756714, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.777320608496666, + "num_tokens": 1273291391.0, + "step": 4429 + }, + { + "epoch": 1.5777382012466608, + "grad_norm": 0.42987149953842163, + "learning_rate": 1e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7521787732839584, + "num_tokens": 1273596610.0, + "step": 4430 + }, + { + "epoch": 1.5780943900267141, + "grad_norm": 0.49548837542533875, + "learning_rate": 1e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7672989815473557, + "num_tokens": 1273828731.0, + "step": 4431 + }, + { + "epoch": 1.5784505788067675, + "grad_norm": 0.4842499792575836, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.75871641933918, + "num_tokens": 1274085832.0, + "step": 4432 + }, + { + "epoch": 1.578806767586821, + "grad_norm": 0.46369874477386475, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7571272552013397, + "num_tokens": 1274380623.0, + "step": 4433 + }, + { + "epoch": 1.5791629563668743, + "grad_norm": 0.5430765748023987, + "learning_rate": 1e-06, + "loss": 0.7475, + "mean_token_accuracy": 
0.7580690681934357, + "num_tokens": 1274649677.0, + "step": 4434 + }, + { + "epoch": 1.5795191451469277, + "grad_norm": 0.4846119284629822, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7480471879243851, + "num_tokens": 1274939231.0, + "step": 4435 + }, + { + "epoch": 1.5798753339269813, + "grad_norm": 0.47981661558151245, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7485817819833755, + "num_tokens": 1275230524.0, + "step": 4436 + }, + { + "epoch": 1.5802315227070347, + "grad_norm": 0.48983824253082275, + "learning_rate": 1e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7650132030248642, + "num_tokens": 1275502312.0, + "step": 4437 + }, + { + "epoch": 1.5805877114870883, + "grad_norm": 0.4982505738735199, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7602427154779434, + "num_tokens": 1275783714.0, + "step": 4438 + }, + { + "epoch": 1.5809439002671417, + "grad_norm": 0.5181300044059753, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7539836019277573, + "num_tokens": 1276060409.0, + "step": 4439 + }, + { + "epoch": 1.581300089047195, + "grad_norm": 0.4522481858730316, + "learning_rate": 1e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.774509847164154, + "num_tokens": 1276327073.0, + "step": 4440 + }, + { + "epoch": 1.5816562778272485, + "grad_norm": 0.44066664576530457, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7711947113275528, + "num_tokens": 1276637420.0, + "step": 4441 + }, + { + "epoch": 1.5820124666073019, + "grad_norm": 0.47812700271606445, + "learning_rate": 1e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7653531283140182, + "num_tokens": 1276901317.0, + "step": 4442 + }, + { + "epoch": 1.5823686553873553, + "grad_norm": 0.5855749249458313, + "learning_rate": 1e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7585025876760483, + "num_tokens": 1277168461.0, + "step": 4443 + }, + { + "epoch": 1.5827248441674087, + "grad_norm": 
0.4471776783466339, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7794892638921738, + "num_tokens": 1277471810.0, + "step": 4444 + }, + { + "epoch": 1.583081032947462, + "grad_norm": 0.46301448345184326, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7529703378677368, + "num_tokens": 1277772652.0, + "step": 4445 + }, + { + "epoch": 1.5834372217275154, + "grad_norm": 0.443033367395401, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7647593319416046, + "num_tokens": 1278055732.0, + "step": 4446 + }, + { + "epoch": 1.583793410507569, + "grad_norm": 0.4587210416793823, + "learning_rate": 1e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7724838554859161, + "num_tokens": 1278363668.0, + "step": 4447 + }, + { + "epoch": 1.5841495992876224, + "grad_norm": 0.45454320311546326, + "learning_rate": 1e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7654496729373932, + "num_tokens": 1278640651.0, + "step": 4448 + }, + { + "epoch": 1.5845057880676758, + "grad_norm": 0.4902423918247223, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7676506489515305, + "num_tokens": 1278920573.0, + "step": 4449 + }, + { + "epoch": 1.5848619768477294, + "grad_norm": 0.4807342290878296, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7444210648536682, + "num_tokens": 1279226746.0, + "step": 4450 + }, + { + "epoch": 1.5852181656277828, + "grad_norm": 0.43684300780296326, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7678870856761932, + "num_tokens": 1279522854.0, + "step": 4451 + }, + { + "epoch": 1.5855743544078362, + "grad_norm": 0.4963440001010895, + "learning_rate": 1e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7632872313261032, + "num_tokens": 1279815832.0, + "step": 4452 + }, + { + "epoch": 1.5859305431878896, + "grad_norm": 0.4745043218135834, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7545990496873856, + 
"num_tokens": 1280101737.0, + "step": 4453 + }, + { + "epoch": 1.586286731967943, + "grad_norm": 0.45628854632377625, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7725084871053696, + "num_tokens": 1280395499.0, + "step": 4454 + }, + { + "epoch": 1.5866429207479964, + "grad_norm": 0.48025786876678467, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7588109076023102, + "num_tokens": 1280661612.0, + "step": 4455 + }, + { + "epoch": 1.5869991095280498, + "grad_norm": 0.5003387331962585, + "learning_rate": 1e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.7836640030145645, + "num_tokens": 1280954489.0, + "step": 4456 + }, + { + "epoch": 1.5873552983081032, + "grad_norm": 0.47034427523612976, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7712162435054779, + "num_tokens": 1281227801.0, + "step": 4457 + }, + { + "epoch": 1.5877114870881566, + "grad_norm": 0.4727645516395569, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7698522657155991, + "num_tokens": 1281516643.0, + "step": 4458 + }, + { + "epoch": 1.5880676758682102, + "grad_norm": 0.5008234977722168, + "learning_rate": 1e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.779044970870018, + "num_tokens": 1281823598.0, + "step": 4459 + }, + { + "epoch": 1.5884238646482636, + "grad_norm": 0.48372167348861694, + "learning_rate": 1e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7781163156032562, + "num_tokens": 1282092452.0, + "step": 4460 + }, + { + "epoch": 1.588780053428317, + "grad_norm": 0.49467504024505615, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.756243109703064, + "num_tokens": 1282361972.0, + "step": 4461 + }, + { + "epoch": 1.5891362422083706, + "grad_norm": 0.46052420139312744, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7674231678247452, + "num_tokens": 1282665451.0, + "step": 4462 + }, + { + "epoch": 1.589492430988424, + "grad_norm": 0.45880016684532166, 
+ "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7583940327167511, + "num_tokens": 1282912761.0, + "step": 4463 + }, + { + "epoch": 1.5898486197684774, + "grad_norm": 0.49968165159225464, + "learning_rate": 1e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7541870027780533, + "num_tokens": 1283222654.0, + "step": 4464 + }, + { + "epoch": 1.5902048085485307, + "grad_norm": 0.48481446504592896, + "learning_rate": 1e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.763387605547905, + "num_tokens": 1283497757.0, + "step": 4465 + }, + { + "epoch": 1.5905609973285841, + "grad_norm": 0.5428211092948914, + "learning_rate": 1e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7668834626674652, + "num_tokens": 1283744686.0, + "step": 4466 + }, + { + "epoch": 1.5909171861086375, + "grad_norm": 0.5492246747016907, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7281347066164017, + "num_tokens": 1284013432.0, + "step": 4467 + }, + { + "epoch": 1.591273374888691, + "grad_norm": 0.4191490113735199, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7701003849506378, + "num_tokens": 1284300842.0, + "step": 4468 + }, + { + "epoch": 1.5916295636687443, + "grad_norm": 0.4843599200248718, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7726950794458389, + "num_tokens": 1284572223.0, + "step": 4469 + }, + { + "epoch": 1.5919857524487977, + "grad_norm": 0.4816981256008148, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7646863907575607, + "num_tokens": 1284868952.0, + "step": 4470 + }, + { + "epoch": 1.5923419412288513, + "grad_norm": 0.405632883310318, + "learning_rate": 1e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7916213124990463, + "num_tokens": 1285183174.0, + "step": 4471 + }, + { + "epoch": 1.5926981300089047, + "grad_norm": 0.48411867022514343, + "learning_rate": 1e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7696869373321533, + "num_tokens": 1285480678.0, 
+ "step": 4472 + }, + { + "epoch": 1.5930543187889583, + "grad_norm": 0.4808294177055359, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.769049346446991, + "num_tokens": 1285762342.0, + "step": 4473 + }, + { + "epoch": 1.5934105075690117, + "grad_norm": 0.4812617599964142, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7671254277229309, + "num_tokens": 1286046646.0, + "step": 4474 + }, + { + "epoch": 1.593766696349065, + "grad_norm": 0.4641973078250885, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7614549845457077, + "num_tokens": 1286342444.0, + "step": 4475 + }, + { + "epoch": 1.5941228851291185, + "grad_norm": 0.4699735939502716, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7567323297262192, + "num_tokens": 1286604599.0, + "step": 4476 + }, + { + "epoch": 1.5944790739091719, + "grad_norm": 0.4314265847206116, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7726818323135376, + "num_tokens": 1286929166.0, + "step": 4477 + }, + { + "epoch": 1.5948352626892253, + "grad_norm": 0.4928852319717407, + "learning_rate": 1e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7657692432403564, + "num_tokens": 1287221912.0, + "step": 4478 + }, + { + "epoch": 1.5951914514692787, + "grad_norm": 0.5115230679512024, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7688692361116409, + "num_tokens": 1287505756.0, + "step": 4479 + }, + { + "epoch": 1.595547640249332, + "grad_norm": 0.4665304124355316, + "learning_rate": 1e-06, + "loss": 0.6528, + "mean_token_accuracy": 0.7866641134023666, + "num_tokens": 1287787685.0, + "step": 4480 + }, + { + "epoch": 1.5959038290293854, + "grad_norm": 0.447704553604126, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7712603807449341, + "num_tokens": 1288057920.0, + "step": 4481 + }, + { + "epoch": 1.596260017809439, + "grad_norm": 0.4453437626361847, + "learning_rate": 1e-06, + 
"loss": 0.7425, + "mean_token_accuracy": 0.7651358246803284, + "num_tokens": 1288349127.0, + "step": 4482 + }, + { + "epoch": 1.5966162065894924, + "grad_norm": 0.48000940680503845, + "learning_rate": 1e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7680118829011917, + "num_tokens": 1288620532.0, + "step": 4483 + }, + { + "epoch": 1.5969723953695458, + "grad_norm": 0.43511825799942017, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7572760134935379, + "num_tokens": 1288925784.0, + "step": 4484 + }, + { + "epoch": 1.5973285841495994, + "grad_norm": 0.49864068627357483, + "learning_rate": 1e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7621950507164001, + "num_tokens": 1289205773.0, + "step": 4485 + }, + { + "epoch": 1.5976847729296528, + "grad_norm": 0.4878060817718506, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.758368581533432, + "num_tokens": 1289508227.0, + "step": 4486 + }, + { + "epoch": 1.5980409617097062, + "grad_norm": 0.44719380140304565, + "learning_rate": 1e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7573172152042389, + "num_tokens": 1289810621.0, + "step": 4487 + }, + { + "epoch": 1.5983971504897596, + "grad_norm": 0.4949771463871002, + "learning_rate": 1e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7779068052768707, + "num_tokens": 1290094497.0, + "step": 4488 + }, + { + "epoch": 1.598753339269813, + "grad_norm": 0.4401043653488159, + "learning_rate": 1e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.779817596077919, + "num_tokens": 1290405762.0, + "step": 4489 + }, + { + "epoch": 1.5991095280498664, + "grad_norm": 0.4475487470626831, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7668684273958206, + "num_tokens": 1290685726.0, + "step": 4490 + }, + { + "epoch": 1.5994657168299198, + "grad_norm": 0.49836450815200806, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7663760632276535, + "num_tokens": 1290965794.0, + "step": 4491 + }, + { + 
"epoch": 1.5998219056099732, + "grad_norm": 0.49509912729263306, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7611704468727112, + "num_tokens": 1291248959.0, + "step": 4492 + }, + { + "epoch": 1.6001780943900266, + "grad_norm": 0.48199543356895447, + "learning_rate": 1e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.7722212076187134, + "num_tokens": 1291521109.0, + "step": 4493 + }, + { + "epoch": 1.6005342831700802, + "grad_norm": 0.4681381285190582, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7699057757854462, + "num_tokens": 1291807244.0, + "step": 4494 + }, + { + "epoch": 1.6008904719501336, + "grad_norm": 0.45951828360557556, + "learning_rate": 1e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7696851193904877, + "num_tokens": 1292112703.0, + "step": 4495 + }, + { + "epoch": 1.601246660730187, + "grad_norm": 0.46030327677726746, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7609318196773529, + "num_tokens": 1292399305.0, + "step": 4496 + }, + { + "epoch": 1.6016028495102406, + "grad_norm": 0.4304859936237335, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7755509912967682, + "num_tokens": 1292735323.0, + "step": 4497 + }, + { + "epoch": 1.601959038290294, + "grad_norm": 0.46349379420280457, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7628264278173447, + "num_tokens": 1293047732.0, + "step": 4498 + }, + { + "epoch": 1.6023152270703473, + "grad_norm": 0.4746135473251343, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7676612436771393, + "num_tokens": 1293355142.0, + "step": 4499 + }, + { + "epoch": 1.6026714158504007, + "grad_norm": 0.4263843595981598, + "learning_rate": 1e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7607451230287552, + "num_tokens": 1293644155.0, + "step": 4500 + }, + { + "epoch": 1.6030276046304541, + "grad_norm": 0.4570733606815338, + "learning_rate": 1e-06, + "loss": 0.6925, + 
"mean_token_accuracy": 0.7748585045337677, + "num_tokens": 1293896665.0, + "step": 4501 + }, + { + "epoch": 1.6033837934105075, + "grad_norm": 0.505861222743988, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7614277899265289, + "num_tokens": 1294180081.0, + "step": 4502 + }, + { + "epoch": 1.603739982190561, + "grad_norm": 0.4752318561077118, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7676278352737427, + "num_tokens": 1294476486.0, + "step": 4503 + }, + { + "epoch": 1.6040961709706143, + "grad_norm": 0.46789830923080444, + "learning_rate": 1e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7530785948038101, + "num_tokens": 1294764340.0, + "step": 4504 + }, + { + "epoch": 1.6044523597506677, + "grad_norm": 0.482755571603775, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7625394314527512, + "num_tokens": 1295045912.0, + "step": 4505 + }, + { + "epoch": 1.6048085485307213, + "grad_norm": 0.46165981888771057, + "learning_rate": 1e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7723526507616043, + "num_tokens": 1295326708.0, + "step": 4506 + }, + { + "epoch": 1.6051647373107747, + "grad_norm": 0.48767292499542236, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7597102373838425, + "num_tokens": 1295621691.0, + "step": 4507 + }, + { + "epoch": 1.6055209260908283, + "grad_norm": 0.47601282596588135, + "learning_rate": 1e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7659623473882675, + "num_tokens": 1295918494.0, + "step": 4508 + }, + { + "epoch": 1.6058771148708817, + "grad_norm": 0.437569260597229, + "learning_rate": 1e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7768766433000565, + "num_tokens": 1296226521.0, + "step": 4509 + }, + { + "epoch": 1.606233303650935, + "grad_norm": 0.4225585162639618, + "learning_rate": 1e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.7759215533733368, + "num_tokens": 1296530309.0, + "step": 4510 + }, + { + "epoch": 
1.6065894924309885, + "grad_norm": 0.44343143701553345, + "learning_rate": 1e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7718220949172974, + "num_tokens": 1296843354.0, + "step": 4511 + }, + { + "epoch": 1.6069456812110419, + "grad_norm": 0.5089576244354248, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7501575797796249, + "num_tokens": 1297094402.0, + "step": 4512 + }, + { + "epoch": 1.6073018699910953, + "grad_norm": 0.4802210330963135, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7655789703130722, + "num_tokens": 1297386198.0, + "step": 4513 + }, + { + "epoch": 1.6076580587711486, + "grad_norm": 0.4521753191947937, + "learning_rate": 1e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7713636606931686, + "num_tokens": 1297690719.0, + "step": 4514 + }, + { + "epoch": 1.608014247551202, + "grad_norm": 0.48112860321998596, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7661131769418716, + "num_tokens": 1297949316.0, + "step": 4515 + }, + { + "epoch": 1.6083704363312554, + "grad_norm": 0.44099411368370056, + "learning_rate": 1e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7780199199914932, + "num_tokens": 1298229133.0, + "step": 4516 + }, + { + "epoch": 1.608726625111309, + "grad_norm": 0.48382729291915894, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7655088007450104, + "num_tokens": 1298500844.0, + "step": 4517 + }, + { + "epoch": 1.6090828138913624, + "grad_norm": 0.49571704864501953, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.7510537356138229, + "num_tokens": 1298787184.0, + "step": 4518 + }, + { + "epoch": 1.6094390026714158, + "grad_norm": 0.5284031629562378, + "learning_rate": 1e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7528073042631149, + "num_tokens": 1299063062.0, + "step": 4519 + }, + { + "epoch": 1.6097951914514694, + "grad_norm": 0.45494797825813293, + "learning_rate": 1e-06, + "loss": 0.6922, + 
"mean_token_accuracy": 0.778063639998436, + "num_tokens": 1299385476.0, + "step": 4520 + }, + { + "epoch": 1.6101513802315228, + "grad_norm": 0.4508303105831146, + "learning_rate": 1e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7848753780126572, + "num_tokens": 1299690261.0, + "step": 4521 + }, + { + "epoch": 1.6105075690115762, + "grad_norm": 0.5075223445892334, + "learning_rate": 1e-06, + "loss": 0.6766, + "mean_token_accuracy": 0.7744549065828323, + "num_tokens": 1299977604.0, + "step": 4522 + }, + { + "epoch": 1.6108637577916296, + "grad_norm": 0.47543585300445557, + "learning_rate": 1e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7783253341913223, + "num_tokens": 1300288615.0, + "step": 4523 + }, + { + "epoch": 1.611219946571683, + "grad_norm": 0.47392475605010986, + "learning_rate": 1e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7764818370342255, + "num_tokens": 1300570597.0, + "step": 4524 + }, + { + "epoch": 1.6115761353517364, + "grad_norm": 0.4777989089488983, + "learning_rate": 1e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7777366489171982, + "num_tokens": 1300872750.0, + "step": 4525 + }, + { + "epoch": 1.6119323241317898, + "grad_norm": 0.44255968928337097, + "learning_rate": 1e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.7785724997520447, + "num_tokens": 1301119659.0, + "step": 4526 + }, + { + "epoch": 1.6122885129118432, + "grad_norm": 0.4765239655971527, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7537539899349213, + "num_tokens": 1301403586.0, + "step": 4527 + }, + { + "epoch": 1.6126447016918966, + "grad_norm": 0.523419976234436, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.776353195309639, + "num_tokens": 1301666307.0, + "step": 4528 + }, + { + "epoch": 1.6130008904719502, + "grad_norm": 0.4435690939426422, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7562123090028763, + "num_tokens": 1301939285.0, + "step": 4529 + }, + { + "epoch": 
1.6133570792520036, + "grad_norm": 0.43895378708839417, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7651820778846741, + "num_tokens": 1302242609.0, + "step": 4530 + }, + { + "epoch": 1.613713268032057, + "grad_norm": 0.43968841433525085, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7674042582511902, + "num_tokens": 1302527802.0, + "step": 4531 + }, + { + "epoch": 1.6140694568121106, + "grad_norm": 0.4357070028781891, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7644886821508408, + "num_tokens": 1302830613.0, + "step": 4532 + }, + { + "epoch": 1.614425645592164, + "grad_norm": 0.47452816367149353, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7669912725687027, + "num_tokens": 1303079850.0, + "step": 4533 + }, + { + "epoch": 1.6147818343722173, + "grad_norm": 0.4611712694168091, + "learning_rate": 1e-06, + "loss": 0.6172, + "mean_token_accuracy": 0.7960197180509567, + "num_tokens": 1303384690.0, + "step": 4534 + }, + { + "epoch": 1.6151380231522707, + "grad_norm": 0.5323165059089661, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7711980491876602, + "num_tokens": 1303642997.0, + "step": 4535 + }, + { + "epoch": 1.6154942119323241, + "grad_norm": 0.4239034056663513, + "learning_rate": 1e-06, + "loss": 0.6483, + "mean_token_accuracy": 0.7896718531847, + "num_tokens": 1303921094.0, + "step": 4536 + }, + { + "epoch": 1.6158504007123775, + "grad_norm": 0.4258495271205902, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7754814028739929, + "num_tokens": 1304217951.0, + "step": 4537 + }, + { + "epoch": 1.616206589492431, + "grad_norm": 0.4232921898365021, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7507994621992111, + "num_tokens": 1304494170.0, + "step": 4538 + }, + { + "epoch": 1.6165627782724843, + "grad_norm": 0.4745355546474457, + "learning_rate": 1e-06, + "loss": 0.7076, + 
"mean_token_accuracy": 0.7621996998786926, + "num_tokens": 1304761467.0, + "step": 4539 + }, + { + "epoch": 1.6169189670525377, + "grad_norm": 0.4519432485103607, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7520012408494949, + "num_tokens": 1305061937.0, + "step": 4540 + }, + { + "epoch": 1.6172751558325913, + "grad_norm": 0.5347123146057129, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7712372988462448, + "num_tokens": 1305298735.0, + "step": 4541 + }, + { + "epoch": 1.6176313446126447, + "grad_norm": 0.4935384690761566, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7601161450147629, + "num_tokens": 1305564712.0, + "step": 4542 + }, + { + "epoch": 1.6179875333926983, + "grad_norm": 0.4356458783149719, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7649129927158356, + "num_tokens": 1305871577.0, + "step": 4543 + }, + { + "epoch": 1.6183437221727517, + "grad_norm": 0.5043309926986694, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7596884220838547, + "num_tokens": 1306132007.0, + "step": 4544 + }, + { + "epoch": 1.618699910952805, + "grad_norm": 0.4530723989009857, + "learning_rate": 1e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7544848918914795, + "num_tokens": 1306425276.0, + "step": 4545 + }, + { + "epoch": 1.6190560997328585, + "grad_norm": 0.48147520422935486, + "learning_rate": 1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7741963267326355, + "num_tokens": 1306688065.0, + "step": 4546 + }, + { + "epoch": 1.6194122885129119, + "grad_norm": 0.47809064388275146, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7732081115245819, + "num_tokens": 1306959181.0, + "step": 4547 + }, + { + "epoch": 1.6197684772929652, + "grad_norm": 0.4325096607208252, + "learning_rate": 1e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7757893353700638, + "num_tokens": 1307262364.0, + "step": 4548 + }, + { + "epoch": 
1.6201246660730186, + "grad_norm": 0.5093919634819031, + "learning_rate": 1e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7599758207798004, + "num_tokens": 1307559055.0, + "step": 4549 + }, + { + "epoch": 1.620480854853072, + "grad_norm": 0.4978332817554474, + "learning_rate": 1e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7683099508285522, + "num_tokens": 1307840516.0, + "step": 4550 + }, + { + "epoch": 1.6208370436331254, + "grad_norm": 0.4654041826725006, + "learning_rate": 1e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7754700183868408, + "num_tokens": 1308127744.0, + "step": 4551 + }, + { + "epoch": 1.621193232413179, + "grad_norm": 0.4134148061275482, + "learning_rate": 1e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.7512146979570389, + "num_tokens": 1308429900.0, + "step": 4552 + }, + { + "epoch": 1.6215494211932324, + "grad_norm": 0.4173249304294586, + "learning_rate": 1e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7811898440122604, + "num_tokens": 1308760149.0, + "step": 4553 + }, + { + "epoch": 1.6219056099732858, + "grad_norm": 0.49763137102127075, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7669223994016647, + "num_tokens": 1309048438.0, + "step": 4554 + }, + { + "epoch": 1.6222617987533394, + "grad_norm": 0.4090615510940552, + "learning_rate": 1e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7690142840147018, + "num_tokens": 1309370573.0, + "step": 4555 + }, + { + "epoch": 1.6226179875333928, + "grad_norm": 0.45674794912338257, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7522246092557907, + "num_tokens": 1309678263.0, + "step": 4556 + }, + { + "epoch": 1.6229741763134462, + "grad_norm": 0.5442628860473633, + "learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7705781161785126, + "num_tokens": 1309935620.0, + "step": 4557 + }, + { + "epoch": 1.6233303650934996, + "grad_norm": 0.4645960032939911, + "learning_rate": 1e-06, + "loss": 0.7261, + 
"mean_token_accuracy": 0.7632615715265274, + "num_tokens": 1310197333.0, + "step": 4558 + }, + { + "epoch": 1.623686553873553, + "grad_norm": 0.43962910771369934, + "learning_rate": 1e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.78669373691082, + "num_tokens": 1310468986.0, + "step": 4559 + }, + { + "epoch": 1.6240427426536064, + "grad_norm": 0.5280665755271912, + "learning_rate": 1e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7617897689342499, + "num_tokens": 1310742836.0, + "step": 4560 + }, + { + "epoch": 1.6243989314336598, + "grad_norm": 0.4839479625225067, + "learning_rate": 1e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7862700521945953, + "num_tokens": 1311034078.0, + "step": 4561 + }, + { + "epoch": 1.6247551202137132, + "grad_norm": 0.48764950037002563, + "learning_rate": 1e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.7786372303962708, + "num_tokens": 1311310563.0, + "step": 4562 + }, + { + "epoch": 1.6251113089937665, + "grad_norm": 0.5004125833511353, + "learning_rate": 1e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7749398052692413, + "num_tokens": 1311584382.0, + "step": 4563 + }, + { + "epoch": 1.6254674977738202, + "grad_norm": 0.5232874155044556, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7633287608623505, + "num_tokens": 1311855157.0, + "step": 4564 + }, + { + "epoch": 1.6258236865538735, + "grad_norm": 0.4792744219303131, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7599530965089798, + "num_tokens": 1312131318.0, + "step": 4565 + }, + { + "epoch": 1.626179875333927, + "grad_norm": 0.4956592321395874, + "learning_rate": 1e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7474230080842972, + "num_tokens": 1312410388.0, + "step": 4566 + }, + { + "epoch": 1.6265360641139806, + "grad_norm": 0.4705141484737396, + "learning_rate": 1e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7737419009208679, + "num_tokens": 1312687040.0, + "step": 4567 + }, + { + "epoch": 
1.626892252894034, + "grad_norm": 0.484636127948761, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.765994980931282, + "num_tokens": 1312971782.0, + "step": 4568 + }, + { + "epoch": 1.6272484416740873, + "grad_norm": 0.4599413573741913, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7664586454629898, + "num_tokens": 1313267080.0, + "step": 4569 + }, + { + "epoch": 1.6276046304541407, + "grad_norm": 0.4741330146789551, + "learning_rate": 1e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7796279191970825, + "num_tokens": 1313550878.0, + "step": 4570 + }, + { + "epoch": 1.6279608192341941, + "grad_norm": 0.444430410861969, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.74732506275177, + "num_tokens": 1313852707.0, + "step": 4571 + }, + { + "epoch": 1.6283170080142475, + "grad_norm": 0.42419397830963135, + "learning_rate": 1e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7769964933395386, + "num_tokens": 1314194866.0, + "step": 4572 + }, + { + "epoch": 1.628673196794301, + "grad_norm": 0.4569360613822937, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7673059105873108, + "num_tokens": 1314510435.0, + "step": 4573 + }, + { + "epoch": 1.6290293855743543, + "grad_norm": 0.5066809058189392, + "learning_rate": 1e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7619811594486237, + "num_tokens": 1314811463.0, + "step": 4574 + }, + { + "epoch": 1.6293855743544077, + "grad_norm": 0.48372313380241394, + "learning_rate": 1e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7622537165880203, + "num_tokens": 1315104122.0, + "step": 4575 + }, + { + "epoch": 1.6297417631344613, + "grad_norm": 0.49244916439056396, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7684203088283539, + "num_tokens": 1315384344.0, + "step": 4576 + }, + { + "epoch": 1.6300979519145147, + "grad_norm": 0.4629954695701599, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 
0.7750529795885086, + "num_tokens": 1315682865.0, + "step": 4577 + }, + { + "epoch": 1.630454140694568, + "grad_norm": 0.44910499453544617, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7701127380132675, + "num_tokens": 1315935562.0, + "step": 4578 + }, + { + "epoch": 1.6308103294746217, + "grad_norm": 0.4890722334384918, + "learning_rate": 1e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7764822840690613, + "num_tokens": 1316219445.0, + "step": 4579 + }, + { + "epoch": 1.631166518254675, + "grad_norm": 0.4136064052581787, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7598975449800491, + "num_tokens": 1316507583.0, + "step": 4580 + }, + { + "epoch": 1.6315227070347285, + "grad_norm": 0.4963957965373993, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7742351144552231, + "num_tokens": 1316763109.0, + "step": 4581 + }, + { + "epoch": 1.6318788958147818, + "grad_norm": 0.4954244792461395, + "learning_rate": 1e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7729615718126297, + "num_tokens": 1317049424.0, + "step": 4582 + }, + { + "epoch": 1.6322350845948352, + "grad_norm": 0.4795924127101898, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7543935626745224, + "num_tokens": 1317313704.0, + "step": 4583 + }, + { + "epoch": 1.6325912733748886, + "grad_norm": 0.4819391965866089, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7671210318803787, + "num_tokens": 1317615512.0, + "step": 4584 + }, + { + "epoch": 1.632947462154942, + "grad_norm": 0.4725055992603302, + "learning_rate": 1e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7808280140161514, + "num_tokens": 1317907281.0, + "step": 4585 + }, + { + "epoch": 1.6333036509349954, + "grad_norm": 0.4612849950790405, + "learning_rate": 1e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.7763327062129974, + "num_tokens": 1318213389.0, + "step": 4586 + }, + { + "epoch": 1.633659839715049, + "grad_norm": 
0.44041359424591064, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7552226781845093, + "num_tokens": 1318532504.0, + "step": 4587 + }, + { + "epoch": 1.6340160284951024, + "grad_norm": 0.513221800327301, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.746111124753952, + "num_tokens": 1318779302.0, + "step": 4588 + }, + { + "epoch": 1.6343722172751558, + "grad_norm": 0.4808751046657562, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.77176333963871, + "num_tokens": 1319060358.0, + "step": 4589 + }, + { + "epoch": 1.6347284060552094, + "grad_norm": 0.45184051990509033, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7587984055280685, + "num_tokens": 1319349604.0, + "step": 4590 + }, + { + "epoch": 1.6350845948352628, + "grad_norm": 0.46179667115211487, + "learning_rate": 1e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.7738004624843597, + "num_tokens": 1319608129.0, + "step": 4591 + }, + { + "epoch": 1.6354407836153162, + "grad_norm": 0.43585795164108276, + "learning_rate": 1e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7817210406064987, + "num_tokens": 1319915036.0, + "step": 4592 + }, + { + "epoch": 1.6357969723953696, + "grad_norm": 0.4342036247253418, + "learning_rate": 1e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7582067102193832, + "num_tokens": 1320222088.0, + "step": 4593 + }, + { + "epoch": 1.636153161175423, + "grad_norm": 0.4812197685241699, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7613147497177124, + "num_tokens": 1320524810.0, + "step": 4594 + }, + { + "epoch": 1.6365093499554764, + "grad_norm": 0.510168194770813, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7577378004789352, + "num_tokens": 1320794734.0, + "step": 4595 + }, + { + "epoch": 1.6368655387355298, + "grad_norm": 0.521270751953125, + "learning_rate": 1e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.7680496722459793, + 
"num_tokens": 1321058443.0, + "step": 4596 + }, + { + "epoch": 1.6372217275155831, + "grad_norm": 0.501264750957489, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7623269855976105, + "num_tokens": 1321341978.0, + "step": 4597 + }, + { + "epoch": 1.6375779162956365, + "grad_norm": 0.46834567189216614, + "learning_rate": 1e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7804100960493088, + "num_tokens": 1321650521.0, + "step": 4598 + }, + { + "epoch": 1.6379341050756901, + "grad_norm": 0.5016553401947021, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7542989253997803, + "num_tokens": 1321906415.0, + "step": 4599 + }, + { + "epoch": 1.6382902938557435, + "grad_norm": 0.492291659116745, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7449547797441483, + "num_tokens": 1322218151.0, + "step": 4600 + }, + { + "epoch": 1.638646482635797, + "grad_norm": 0.5090184807777405, + "learning_rate": 1e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.751419797539711, + "num_tokens": 1322526699.0, + "step": 4601 + }, + { + "epoch": 1.6390026714158505, + "grad_norm": 0.4489775002002716, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7658409923315048, + "num_tokens": 1322826443.0, + "step": 4602 + }, + { + "epoch": 1.639358860195904, + "grad_norm": 0.45549091696739197, + "learning_rate": 1e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.774152547121048, + "num_tokens": 1323112634.0, + "step": 4603 + }, + { + "epoch": 1.6397150489759573, + "grad_norm": 0.4474446475505829, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7583097815513611, + "num_tokens": 1323405305.0, + "step": 4604 + }, + { + "epoch": 1.6400712377560107, + "grad_norm": 0.473428875207901, + "learning_rate": 1e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7680543661117554, + "num_tokens": 1323707508.0, + "step": 4605 + }, + { + "epoch": 1.640427426536064, + "grad_norm": 0.47822439670562744, + 
"learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7567806094884872, + "num_tokens": 1323974550.0, + "step": 4606 + }, + { + "epoch": 1.6407836153161175, + "grad_norm": 0.4912404417991638, + "learning_rate": 1e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.7558082789182663, + "num_tokens": 1324298409.0, + "step": 4607 + }, + { + "epoch": 1.6411398040961709, + "grad_norm": 0.47922560572624207, + "learning_rate": 1e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7698772996664047, + "num_tokens": 1324564799.0, + "step": 4608 + }, + { + "epoch": 1.6414959928762243, + "grad_norm": 0.4648036062717438, + "learning_rate": 1e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.7800932675600052, + "num_tokens": 1324884347.0, + "step": 4609 + }, + { + "epoch": 1.6418521816562777, + "grad_norm": 0.43416333198547363, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7709869146347046, + "num_tokens": 1325203842.0, + "step": 4610 + }, + { + "epoch": 1.6422083704363313, + "grad_norm": 0.4744240939617157, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7469295859336853, + "num_tokens": 1325482510.0, + "step": 4611 + }, + { + "epoch": 1.6425645592163847, + "grad_norm": 0.47176289558410645, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7642024159431458, + "num_tokens": 1325770987.0, + "step": 4612 + }, + { + "epoch": 1.642920747996438, + "grad_norm": 0.5056926012039185, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7669447660446167, + "num_tokens": 1326036524.0, + "step": 4613 + }, + { + "epoch": 1.6432769367764917, + "grad_norm": 0.48398303985595703, + "learning_rate": 1e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.784224659204483, + "num_tokens": 1326323005.0, + "step": 4614 + }, + { + "epoch": 1.643633125556545, + "grad_norm": 0.4678812623023987, + "learning_rate": 1e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7531286478042603, + "num_tokens": 1326632736.0, 
+ "step": 4615 + }, + { + "epoch": 1.6439893143365985, + "grad_norm": 0.4366300404071808, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7658666968345642, + "num_tokens": 1326954568.0, + "step": 4616 + }, + { + "epoch": 1.6443455031166518, + "grad_norm": 0.4619133770465851, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7574133574962616, + "num_tokens": 1327266101.0, + "step": 4617 + }, + { + "epoch": 1.6447016918967052, + "grad_norm": 0.4556916058063507, + "learning_rate": 1e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7777708768844604, + "num_tokens": 1327538057.0, + "step": 4618 + }, + { + "epoch": 1.6450578806767586, + "grad_norm": 0.4930175542831421, + "learning_rate": 1e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7677261084318161, + "num_tokens": 1327811619.0, + "step": 4619 + }, + { + "epoch": 1.645414069456812, + "grad_norm": 0.47690245509147644, + "learning_rate": 1e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.767838716506958, + "num_tokens": 1328107580.0, + "step": 4620 + }, + { + "epoch": 1.6457702582368654, + "grad_norm": 0.4918402135372162, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7833388894796371, + "num_tokens": 1328398928.0, + "step": 4621 + }, + { + "epoch": 1.646126447016919, + "grad_norm": 0.4506984353065491, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7481767535209656, + "num_tokens": 1328664848.0, + "step": 4622 + }, + { + "epoch": 1.6464826357969724, + "grad_norm": 0.48060673475265503, + "learning_rate": 1e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7634409368038177, + "num_tokens": 1328918134.0, + "step": 4623 + }, + { + "epoch": 1.6468388245770258, + "grad_norm": 0.46705907583236694, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7575301378965378, + "num_tokens": 1329209683.0, + "step": 4624 + }, + { + "epoch": 1.6471950133570794, + "grad_norm": 0.45582443475723267, + "learning_rate": 1e-06, + 
"loss": 0.7355, + "mean_token_accuracy": 0.768078163266182, + "num_tokens": 1329478421.0, + "step": 4625 + }, + { + "epoch": 1.6475512021371328, + "grad_norm": 0.45645666122436523, + "learning_rate": 1e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7819383144378662, + "num_tokens": 1329786580.0, + "step": 4626 + }, + { + "epoch": 1.6479073909171862, + "grad_norm": 0.4568172097206116, + "learning_rate": 1e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7707422375679016, + "num_tokens": 1330077100.0, + "step": 4627 + }, + { + "epoch": 1.6482635796972396, + "grad_norm": 0.44023969769477844, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7690966576337814, + "num_tokens": 1330394248.0, + "step": 4628 + }, + { + "epoch": 1.648619768477293, + "grad_norm": 0.433531254529953, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7577113062143326, + "num_tokens": 1330714793.0, + "step": 4629 + }, + { + "epoch": 1.6489759572573464, + "grad_norm": 0.46727028489112854, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7676476687192917, + "num_tokens": 1330989708.0, + "step": 4630 + }, + { + "epoch": 1.6493321460373997, + "grad_norm": 0.454000860452652, + "learning_rate": 1e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7854345887899399, + "num_tokens": 1331287721.0, + "step": 4631 + }, + { + "epoch": 1.6496883348174531, + "grad_norm": 0.5198310613632202, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7673302590847015, + "num_tokens": 1331553259.0, + "step": 4632 + }, + { + "epoch": 1.6500445235975065, + "grad_norm": 0.4694227874279022, + "learning_rate": 1e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7683328241109848, + "num_tokens": 1331855945.0, + "step": 4633 + }, + { + "epoch": 1.6504007123775601, + "grad_norm": 0.44573983550071716, + "learning_rate": 1e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7790866792201996, + "num_tokens": 1332156534.0, + "step": 4634 + }, + { + 
"epoch": 1.6507569011576135, + "grad_norm": 0.4759749472141266, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7682500779628754, + "num_tokens": 1332473792.0, + "step": 4635 + }, + { + "epoch": 1.651113089937667, + "grad_norm": 0.5079384446144104, + "learning_rate": 1e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7630749046802521, + "num_tokens": 1332725425.0, + "step": 4636 + }, + { + "epoch": 1.6514692787177205, + "grad_norm": 0.4772580564022064, + "learning_rate": 1e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7707013785839081, + "num_tokens": 1333023014.0, + "step": 4637 + }, + { + "epoch": 1.651825467497774, + "grad_norm": 0.43413329124450684, + "learning_rate": 1e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7776035219430923, + "num_tokens": 1333319624.0, + "step": 4638 + }, + { + "epoch": 1.6521816562778273, + "grad_norm": 0.4398234188556671, + "learning_rate": 1e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7732751369476318, + "num_tokens": 1333644330.0, + "step": 4639 + }, + { + "epoch": 1.6525378450578807, + "grad_norm": 0.45153307914733887, + "learning_rate": 1e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.7846260368824005, + "num_tokens": 1333935701.0, + "step": 4640 + }, + { + "epoch": 1.652894033837934, + "grad_norm": 0.50068598985672, + "learning_rate": 1e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7732563763856888, + "num_tokens": 1334205551.0, + "step": 4641 + }, + { + "epoch": 1.6532502226179875, + "grad_norm": 0.49492812156677246, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7343206256628036, + "num_tokens": 1334474169.0, + "step": 4642 + }, + { + "epoch": 1.6536064113980409, + "grad_norm": 0.42762675881385803, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.767688125371933, + "num_tokens": 1334788853.0, + "step": 4643 + }, + { + "epoch": 1.6539626001780943, + "grad_norm": 0.5409451723098755, + "learning_rate": 1e-06, + "loss": 0.7711, + 
"mean_token_accuracy": 0.75437131524086, + "num_tokens": 1335060202.0, + "step": 4644 + }, + { + "epoch": 1.6543187889581477, + "grad_norm": 0.5032457709312439, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7552268952131271, + "num_tokens": 1335339192.0, + "step": 4645 + }, + { + "epoch": 1.6546749777382013, + "grad_norm": 0.490344375371933, + "learning_rate": 1e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7663947939872742, + "num_tokens": 1335619813.0, + "step": 4646 + }, + { + "epoch": 1.6550311665182547, + "grad_norm": 0.52678382396698, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7607219964265823, + "num_tokens": 1335882294.0, + "step": 4647 + }, + { + "epoch": 1.655387355298308, + "grad_norm": 0.5061575770378113, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.770867794752121, + "num_tokens": 1336158929.0, + "step": 4648 + }, + { + "epoch": 1.6557435440783617, + "grad_norm": 0.47523608803749084, + "learning_rate": 1e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.7863334864377975, + "num_tokens": 1336448464.0, + "step": 4649 + }, + { + "epoch": 1.656099732858415, + "grad_norm": 0.4629923105239868, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.7501084208488464, + "num_tokens": 1336722902.0, + "step": 4650 + }, + { + "epoch": 1.6564559216384684, + "grad_norm": 0.4436971843242645, + "learning_rate": 1e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.7793568074703217, + "num_tokens": 1337044646.0, + "step": 4651 + }, + { + "epoch": 1.6568121104185218, + "grad_norm": 0.4310338795185089, + "learning_rate": 1e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.782842680811882, + "num_tokens": 1337331283.0, + "step": 4652 + }, + { + "epoch": 1.6571682991985752, + "grad_norm": 0.4687049388885498, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7665114551782608, + "num_tokens": 1337620255.0, + "step": 4653 + }, + { + "epoch": 1.6575244879786286, 
+ "grad_norm": 0.5004373788833618, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7629900872707367, + "num_tokens": 1337914352.0, + "step": 4654 + }, + { + "epoch": 1.657880676758682, + "grad_norm": 0.49292850494384766, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7610797733068466, + "num_tokens": 1338188495.0, + "step": 4655 + }, + { + "epoch": 1.6582368655387354, + "grad_norm": 0.4659282863140106, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7668931782245636, + "num_tokens": 1338461160.0, + "step": 4656 + }, + { + "epoch": 1.658593054318789, + "grad_norm": 0.4290761351585388, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.770206868648529, + "num_tokens": 1338783730.0, + "step": 4657 + }, + { + "epoch": 1.6589492430988424, + "grad_norm": 0.5053356885910034, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7847284823656082, + "num_tokens": 1339058723.0, + "step": 4658 + }, + { + "epoch": 1.6593054318788958, + "grad_norm": 0.482198029756546, + "learning_rate": 1e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7505245506763458, + "num_tokens": 1339334532.0, + "step": 4659 + }, + { + "epoch": 1.6596616206589494, + "grad_norm": 0.44708186388015747, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7593084275722504, + "num_tokens": 1339617979.0, + "step": 4660 + }, + { + "epoch": 1.6600178094390028, + "grad_norm": 0.48198211193084717, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7564798146486282, + "num_tokens": 1339924605.0, + "step": 4661 + }, + { + "epoch": 1.6603739982190562, + "grad_norm": 0.48350125551223755, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7723584920167923, + "num_tokens": 1340210522.0, + "step": 4662 + }, + { + "epoch": 1.6607301869991096, + "grad_norm": 0.4823504686355591, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 
0.7753922492265701, + "num_tokens": 1340494819.0, + "step": 4663 + }, + { + "epoch": 1.661086375779163, + "grad_norm": 0.4888543486595154, + "learning_rate": 1e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7799984663724899, + "num_tokens": 1340783977.0, + "step": 4664 + }, + { + "epoch": 1.6614425645592164, + "grad_norm": 0.44722241163253784, + "learning_rate": 1e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7730536311864853, + "num_tokens": 1341061194.0, + "step": 4665 + }, + { + "epoch": 1.6617987533392697, + "grad_norm": 0.4814845621585846, + "learning_rate": 1e-06, + "loss": 0.658, + "mean_token_accuracy": 0.7822222262620926, + "num_tokens": 1341355946.0, + "step": 4666 + }, + { + "epoch": 1.6621549421193231, + "grad_norm": 0.47776883840560913, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7624336183071136, + "num_tokens": 1341646881.0, + "step": 4667 + }, + { + "epoch": 1.6625111308993765, + "grad_norm": 0.5091850161552429, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7687643319368362, + "num_tokens": 1341894821.0, + "step": 4668 + }, + { + "epoch": 1.6628673196794301, + "grad_norm": 0.5174984931945801, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.760602742433548, + "num_tokens": 1342163571.0, + "step": 4669 + }, + { + "epoch": 1.6632235084594835, + "grad_norm": 0.4292382001876831, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7737176567316055, + "num_tokens": 1342476013.0, + "step": 4670 + }, + { + "epoch": 1.663579697239537, + "grad_norm": 0.4701586663722992, + "learning_rate": 1e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.756623163819313, + "num_tokens": 1342755353.0, + "step": 4671 + }, + { + "epoch": 1.6639358860195905, + "grad_norm": 0.47579458355903625, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7505943179130554, + "num_tokens": 1343038385.0, + "step": 4672 + }, + { + "epoch": 1.664292074799644, + "grad_norm": 
0.4818507134914398, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.758762314915657, + "num_tokens": 1343343150.0, + "step": 4673 + }, + { + "epoch": 1.6646482635796973, + "grad_norm": 0.47091910243034363, + "learning_rate": 1e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.7691228538751602, + "num_tokens": 1343638340.0, + "step": 4674 + }, + { + "epoch": 1.6650044523597507, + "grad_norm": 0.48576775193214417, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7556423097848892, + "num_tokens": 1343890507.0, + "step": 4675 + }, + { + "epoch": 1.665360641139804, + "grad_norm": 0.42050620913505554, + "learning_rate": 1e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7781517505645752, + "num_tokens": 1344165875.0, + "step": 4676 + }, + { + "epoch": 1.6657168299198575, + "grad_norm": 0.44180572032928467, + "learning_rate": 1e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7631212621927261, + "num_tokens": 1344481496.0, + "step": 4677 + }, + { + "epoch": 1.6660730186999109, + "grad_norm": 0.5359728932380676, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7705101817846298, + "num_tokens": 1344747517.0, + "step": 4678 + }, + { + "epoch": 1.6664292074799643, + "grad_norm": 0.48363354802131653, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7587077766656876, + "num_tokens": 1345029063.0, + "step": 4679 + }, + { + "epoch": 1.6667853962600176, + "grad_norm": 0.4725097715854645, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7518695890903473, + "num_tokens": 1345342465.0, + "step": 4680 + }, + { + "epoch": 1.6671415850400713, + "grad_norm": 0.4435693919658661, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7620205730199814, + "num_tokens": 1345661179.0, + "step": 4681 + }, + { + "epoch": 1.6674977738201247, + "grad_norm": 0.49250155687332153, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7615340203046799, + 
"num_tokens": 1345978432.0, + "step": 4682 + }, + { + "epoch": 1.667853962600178, + "grad_norm": 0.4840758740901947, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7637151777744293, + "num_tokens": 1346279785.0, + "step": 4683 + }, + { + "epoch": 1.6682101513802317, + "grad_norm": 0.4347550868988037, + "learning_rate": 1e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7674322724342346, + "num_tokens": 1346598421.0, + "step": 4684 + }, + { + "epoch": 1.668566340160285, + "grad_norm": 0.4727330207824707, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7734518349170685, + "num_tokens": 1346891444.0, + "step": 4685 + }, + { + "epoch": 1.6689225289403384, + "grad_norm": 0.43972647190093994, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7707450836896896, + "num_tokens": 1347237937.0, + "step": 4686 + }, + { + "epoch": 1.6692787177203918, + "grad_norm": 0.4552144706249237, + "learning_rate": 1e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7561701238155365, + "num_tokens": 1347515583.0, + "step": 4687 + }, + { + "epoch": 1.6696349065004452, + "grad_norm": 0.4900066554546356, + "learning_rate": 1e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7705812156200409, + "num_tokens": 1347803089.0, + "step": 4688 + }, + { + "epoch": 1.6699910952804986, + "grad_norm": 0.5178512334823608, + "learning_rate": 1e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7769374698400497, + "num_tokens": 1348091405.0, + "step": 4689 + }, + { + "epoch": 1.670347284060552, + "grad_norm": 0.5129429697990417, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7596686780452728, + "num_tokens": 1348353572.0, + "step": 4690 + }, + { + "epoch": 1.6707034728406054, + "grad_norm": 0.45011284947395325, + "learning_rate": 1e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7676748037338257, + "num_tokens": 1348645744.0, + "step": 4691 + }, + { + "epoch": 1.671059661620659, + "grad_norm": 0.444328635931015, + 
"learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7694458812475204, + "num_tokens": 1348932168.0, + "step": 4692 + }, + { + "epoch": 1.6714158504007124, + "grad_norm": 0.4647696912288666, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.749552309513092, + "num_tokens": 1349239805.0, + "step": 4693 + }, + { + "epoch": 1.6717720391807658, + "grad_norm": 0.4466795027256012, + "learning_rate": 1e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.7803139984607697, + "num_tokens": 1349514223.0, + "step": 4694 + }, + { + "epoch": 1.6721282279608194, + "grad_norm": 0.49745431542396545, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7497501522302628, + "num_tokens": 1349778428.0, + "step": 4695 + }, + { + "epoch": 1.6724844167408728, + "grad_norm": 0.47609743475914, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.763004332780838, + "num_tokens": 1350055682.0, + "step": 4696 + }, + { + "epoch": 1.6728406055209262, + "grad_norm": 0.49005231261253357, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7565577030181885, + "num_tokens": 1350344014.0, + "step": 4697 + }, + { + "epoch": 1.6731967943009796, + "grad_norm": 0.4579489529132843, + "learning_rate": 1e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7726770043373108, + "num_tokens": 1350633324.0, + "step": 4698 + }, + { + "epoch": 1.673552983081033, + "grad_norm": 0.4447023868560791, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7640009522438049, + "num_tokens": 1350950667.0, + "step": 4699 + }, + { + "epoch": 1.6739091718610863, + "grad_norm": 0.4814574420452118, + "learning_rate": 1e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.777065321803093, + "num_tokens": 1351224449.0, + "step": 4700 + }, + { + "epoch": 1.6742653606411397, + "grad_norm": 0.4956473112106323, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7645461708307266, + "num_tokens": 1351500646.0, + 
"step": 4701 + }, + { + "epoch": 1.6746215494211931, + "grad_norm": 0.4962291717529297, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7715302258729935, + "num_tokens": 1351769899.0, + "step": 4702 + }, + { + "epoch": 1.6749777382012465, + "grad_norm": 0.48595190048217773, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7477353066205978, + "num_tokens": 1352074007.0, + "step": 4703 + }, + { + "epoch": 1.6753339269813001, + "grad_norm": 0.48599380254745483, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.759526714682579, + "num_tokens": 1352330719.0, + "step": 4704 + }, + { + "epoch": 1.6756901157613535, + "grad_norm": 0.4875049293041229, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7560532987117767, + "num_tokens": 1352598388.0, + "step": 4705 + }, + { + "epoch": 1.676046304541407, + "grad_norm": 0.46411556005477905, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7611210495233536, + "num_tokens": 1352875200.0, + "step": 4706 + }, + { + "epoch": 1.6764024933214605, + "grad_norm": 0.47884228825569153, + "learning_rate": 1e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7746733129024506, + "num_tokens": 1353167628.0, + "step": 4707 + }, + { + "epoch": 1.676758682101514, + "grad_norm": 0.494938462972641, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7656295001506805, + "num_tokens": 1353453597.0, + "step": 4708 + }, + { + "epoch": 1.6771148708815673, + "grad_norm": 0.4474284052848816, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7649738192558289, + "num_tokens": 1353763755.0, + "step": 4709 + }, + { + "epoch": 1.6774710596616207, + "grad_norm": 0.4333096444606781, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7578388452529907, + "num_tokens": 1354102616.0, + "step": 4710 + }, + { + "epoch": 1.677827248441674, + "grad_norm": 0.4423416256904602, + "learning_rate": 1e-06, + 
"loss": 0.7054, + "mean_token_accuracy": 0.7770654112100601, + "num_tokens": 1354376219.0, + "step": 4711 + }, + { + "epoch": 1.6781834372217275, + "grad_norm": 0.5044372081756592, + "learning_rate": 1e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7665127366781235, + "num_tokens": 1354663525.0, + "step": 4712 + }, + { + "epoch": 1.6785396260017809, + "grad_norm": 0.42836952209472656, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7694197744131088, + "num_tokens": 1354987851.0, + "step": 4713 + }, + { + "epoch": 1.6788958147818343, + "grad_norm": 0.44734296202659607, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7564234137535095, + "num_tokens": 1355281446.0, + "step": 4714 + }, + { + "epoch": 1.6792520035618876, + "grad_norm": 0.45892253518104553, + "learning_rate": 1e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7791266292333603, + "num_tokens": 1355568548.0, + "step": 4715 + }, + { + "epoch": 1.6796081923419413, + "grad_norm": 0.45663994550704956, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7586779296398163, + "num_tokens": 1355864848.0, + "step": 4716 + }, + { + "epoch": 1.6799643811219946, + "grad_norm": 0.4827677309513092, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.762845441699028, + "num_tokens": 1356152963.0, + "step": 4717 + }, + { + "epoch": 1.680320569902048, + "grad_norm": 0.4512958824634552, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.760385662317276, + "num_tokens": 1356471491.0, + "step": 4718 + }, + { + "epoch": 1.6806767586821016, + "grad_norm": 0.474211722612381, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7567976266145706, + "num_tokens": 1356741055.0, + "step": 4719 + }, + { + "epoch": 1.681032947462155, + "grad_norm": 0.4722476303577423, + "learning_rate": 1e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7735118120908737, + "num_tokens": 1357002654.0, + "step": 4720 + }, + { + 
"epoch": 1.6813891362422084, + "grad_norm": 0.49419963359832764, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7617388665676117, + "num_tokens": 1357264653.0, + "step": 4721 + }, + { + "epoch": 1.6817453250222618, + "grad_norm": 0.46440640091896057, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7752531319856644, + "num_tokens": 1357559485.0, + "step": 4722 + }, + { + "epoch": 1.6821015138023152, + "grad_norm": 0.49688705801963806, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7600838541984558, + "num_tokens": 1357841664.0, + "step": 4723 + }, + { + "epoch": 1.6824577025823686, + "grad_norm": 0.5126222372055054, + "learning_rate": 1e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.7511759251356125, + "num_tokens": 1358105079.0, + "step": 4724 + }, + { + "epoch": 1.682813891362422, + "grad_norm": 0.4797888994216919, + "learning_rate": 1e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7718266546726227, + "num_tokens": 1358399338.0, + "step": 4725 + }, + { + "epoch": 1.6831700801424754, + "grad_norm": 0.48174935579299927, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7585674524307251, + "num_tokens": 1358693769.0, + "step": 4726 + }, + { + "epoch": 1.683526268922529, + "grad_norm": 0.46926113963127136, + "learning_rate": 1e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7700988501310349, + "num_tokens": 1358998711.0, + "step": 4727 + }, + { + "epoch": 1.6838824577025824, + "grad_norm": 0.44658949971199036, + "learning_rate": 1e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7663963139057159, + "num_tokens": 1359289438.0, + "step": 4728 + }, + { + "epoch": 1.6842386464826358, + "grad_norm": 0.4689638018608093, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7610277384519577, + "num_tokens": 1359585121.0, + "step": 4729 + }, + { + "epoch": 1.6845948352626894, + "grad_norm": 0.4610441327095032, + "learning_rate": 1e-06, + "loss": 0.7103, + 
"mean_token_accuracy": 0.7732073813676834, + "num_tokens": 1359898690.0, + "step": 4730 + }, + { + "epoch": 1.6849510240427428, + "grad_norm": 0.44104355573654175, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7740188390016556, + "num_tokens": 1360202523.0, + "step": 4731 + }, + { + "epoch": 1.6853072128227962, + "grad_norm": 0.4343109726905823, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7527921050786972, + "num_tokens": 1360526366.0, + "step": 4732 + }, + { + "epoch": 1.6856634016028496, + "grad_norm": 0.45775270462036133, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7730297148227692, + "num_tokens": 1360824110.0, + "step": 4733 + }, + { + "epoch": 1.686019590382903, + "grad_norm": 0.48201826214790344, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7745275944471359, + "num_tokens": 1361096791.0, + "step": 4734 + }, + { + "epoch": 1.6863757791629563, + "grad_norm": 0.49265387654304504, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7516181766986847, + "num_tokens": 1361370272.0, + "step": 4735 + }, + { + "epoch": 1.6867319679430097, + "grad_norm": 0.4811987280845642, + "learning_rate": 1e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7731754630804062, + "num_tokens": 1361639789.0, + "step": 4736 + }, + { + "epoch": 1.6870881567230631, + "grad_norm": 0.5306575894355774, + "learning_rate": 1e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7581555247306824, + "num_tokens": 1361900468.0, + "step": 4737 + }, + { + "epoch": 1.6874443455031165, + "grad_norm": 0.47339415550231934, + "learning_rate": 1e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7778217941522598, + "num_tokens": 1362193271.0, + "step": 4738 + }, + { + "epoch": 1.6878005342831701, + "grad_norm": 0.4655230641365051, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7705656141042709, + "num_tokens": 1362497771.0, + "step": 4739 + }, + { + "epoch": 
1.6881567230632235, + "grad_norm": 0.44868192076683044, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7572232186794281, + "num_tokens": 1362773220.0, + "step": 4740 + }, + { + "epoch": 1.688512911843277, + "grad_norm": 0.44754758477211, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7571342438459396, + "num_tokens": 1363061221.0, + "step": 4741 + }, + { + "epoch": 1.6888691006233305, + "grad_norm": 0.4338969886302948, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7680266052484512, + "num_tokens": 1363347778.0, + "step": 4742 + }, + { + "epoch": 1.689225289403384, + "grad_norm": 0.5081873536109924, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7610637843608856, + "num_tokens": 1363625620.0, + "step": 4743 + }, + { + "epoch": 1.6895814781834373, + "grad_norm": 0.4607025980949402, + "learning_rate": 1e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7570861876010895, + "num_tokens": 1363928569.0, + "step": 4744 + }, + { + "epoch": 1.6899376669634907, + "grad_norm": 0.49234738945961, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7576070874929428, + "num_tokens": 1364213822.0, + "step": 4745 + }, + { + "epoch": 1.690293855743544, + "grad_norm": 0.4556550681591034, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7689715772867203, + "num_tokens": 1364473140.0, + "step": 4746 + }, + { + "epoch": 1.6906500445235975, + "grad_norm": 0.467572957277298, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7622601240873337, + "num_tokens": 1364768565.0, + "step": 4747 + }, + { + "epoch": 1.6910062333036509, + "grad_norm": 0.4902714192867279, + "learning_rate": 1e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.7894264608621597, + "num_tokens": 1365053070.0, + "step": 4748 + }, + { + "epoch": 1.6913624220837042, + "grad_norm": 0.4101731479167938, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 
0.7600946724414825, + "num_tokens": 1365374485.0, + "step": 4749 + }, + { + "epoch": 1.6917186108637576, + "grad_norm": 0.4421333074569702, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7758052349090576, + "num_tokens": 1365698650.0, + "step": 4750 + }, + { + "epoch": 1.6920747996438112, + "grad_norm": 0.5050173997879028, + "learning_rate": 1e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7745634168386459, + "num_tokens": 1365976368.0, + "step": 4751 + }, + { + "epoch": 1.6924309884238646, + "grad_norm": 0.491838276386261, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7649306803941727, + "num_tokens": 1366228473.0, + "step": 4752 + }, + { + "epoch": 1.692787177203918, + "grad_norm": 0.5060285925865173, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7352010011672974, + "num_tokens": 1366473321.0, + "step": 4753 + }, + { + "epoch": 1.6931433659839716, + "grad_norm": 0.40355384349823, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7576362490653992, + "num_tokens": 1366791820.0, + "step": 4754 + }, + { + "epoch": 1.693499554764025, + "grad_norm": 0.5188241600990295, + "learning_rate": 1e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.7447880804538727, + "num_tokens": 1367079332.0, + "step": 4755 + }, + { + "epoch": 1.6938557435440784, + "grad_norm": 0.39948442578315735, + "learning_rate": 1e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.7917936891317368, + "num_tokens": 1367383823.0, + "step": 4756 + }, + { + "epoch": 1.6942119323241318, + "grad_norm": 0.4659767746925354, + "learning_rate": 1e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7697001546621323, + "num_tokens": 1367688013.0, + "step": 4757 + }, + { + "epoch": 1.6945681211041852, + "grad_norm": 0.47364744544029236, + "learning_rate": 1e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7703391462564468, + "num_tokens": 1367981953.0, + "step": 4758 + }, + { + "epoch": 1.6949243098842386, + "grad_norm": 
0.4905956983566284, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7549118250608444, + "num_tokens": 1368257962.0, + "step": 4759 + }, + { + "epoch": 1.695280498664292, + "grad_norm": 0.439567506313324, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.756079688668251, + "num_tokens": 1368551558.0, + "step": 4760 + }, + { + "epoch": 1.6956366874443454, + "grad_norm": 0.4402157962322235, + "learning_rate": 1e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7672078162431717, + "num_tokens": 1368870198.0, + "step": 4761 + }, + { + "epoch": 1.695992876224399, + "grad_norm": 0.4513395130634308, + "learning_rate": 1e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.748332068324089, + "num_tokens": 1369149293.0, + "step": 4762 + }, + { + "epoch": 1.6963490650044524, + "grad_norm": 0.44131141901016235, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7625006884336472, + "num_tokens": 1369459939.0, + "step": 4763 + }, + { + "epoch": 1.6967052537845058, + "grad_norm": 0.5024852156639099, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7618293613195419, + "num_tokens": 1369726954.0, + "step": 4764 + }, + { + "epoch": 1.6970614425645594, + "grad_norm": 0.4366608262062073, + "learning_rate": 1e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7685694694519043, + "num_tokens": 1370038702.0, + "step": 4765 + }, + { + "epoch": 1.6974176313446128, + "grad_norm": 0.46261611580848694, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7715583592653275, + "num_tokens": 1370335845.0, + "step": 4766 + }, + { + "epoch": 1.6977738201246662, + "grad_norm": 0.47350192070007324, + "learning_rate": 1e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7791083753108978, + "num_tokens": 1370650256.0, + "step": 4767 + }, + { + "epoch": 1.6981300089047195, + "grad_norm": 0.4479943811893463, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7726662307977676, + 
"num_tokens": 1370938392.0, + "step": 4768 + }, + { + "epoch": 1.698486197684773, + "grad_norm": 0.42715802788734436, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7703132182359695, + "num_tokens": 1371229795.0, + "step": 4769 + }, + { + "epoch": 1.6988423864648263, + "grad_norm": 0.47598567605018616, + "learning_rate": 1e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7477836608886719, + "num_tokens": 1371502328.0, + "step": 4770 + }, + { + "epoch": 1.6991985752448797, + "grad_norm": 0.42889726161956787, + "learning_rate": 1e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7723706364631653, + "num_tokens": 1371795534.0, + "step": 4771 + }, + { + "epoch": 1.699554764024933, + "grad_norm": 0.45842206478118896, + "learning_rate": 1e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.783513069152832, + "num_tokens": 1372097492.0, + "step": 4772 + }, + { + "epoch": 1.6999109528049865, + "grad_norm": 0.4613652527332306, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7761686891317368, + "num_tokens": 1372364086.0, + "step": 4773 + }, + { + "epoch": 1.70026714158504, + "grad_norm": 0.41913366317749023, + "learning_rate": 1e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7714518755674362, + "num_tokens": 1372663029.0, + "step": 4774 + }, + { + "epoch": 1.7006233303650935, + "grad_norm": 0.5255366563796997, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7627631723880768, + "num_tokens": 1372971524.0, + "step": 4775 + }, + { + "epoch": 1.700979519145147, + "grad_norm": 0.45896607637405396, + "learning_rate": 1e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7796875685453415, + "num_tokens": 1373286486.0, + "step": 4776 + }, + { + "epoch": 1.7013357079252005, + "grad_norm": 0.4681069552898407, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7570318728685379, + "num_tokens": 1373552936.0, + "step": 4777 + }, + { + "epoch": 1.701691896705254, + "grad_norm": 0.5683605670928955, + 
"learning_rate": 1e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.779165655374527, + "num_tokens": 1373823309.0, + "step": 4778 + }, + { + "epoch": 1.7020480854853073, + "grad_norm": 0.5447792410850525, + "learning_rate": 1e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7387970834970474, + "num_tokens": 1374109297.0, + "step": 4779 + }, + { + "epoch": 1.7024042742653607, + "grad_norm": 0.4536997079849243, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7581424713134766, + "num_tokens": 1374404236.0, + "step": 4780 + }, + { + "epoch": 1.702760463045414, + "grad_norm": 0.5142472386360168, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7597175985574722, + "num_tokens": 1374652167.0, + "step": 4781 + }, + { + "epoch": 1.7031166518254675, + "grad_norm": 0.4716418981552124, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.767267495393753, + "num_tokens": 1374943809.0, + "step": 4782 + }, + { + "epoch": 1.7034728406055208, + "grad_norm": 0.5015986561775208, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7613560408353806, + "num_tokens": 1375229171.0, + "step": 4783 + }, + { + "epoch": 1.7038290293855742, + "grad_norm": 0.4879612922668457, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7599732726812363, + "num_tokens": 1375499194.0, + "step": 4784 + }, + { + "epoch": 1.7041852181656276, + "grad_norm": 0.44151732325553894, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7722752839326859, + "num_tokens": 1375803486.0, + "step": 4785 + }, + { + "epoch": 1.7045414069456812, + "grad_norm": 0.5306690335273743, + "learning_rate": 1e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7648730874061584, + "num_tokens": 1376075366.0, + "step": 4786 + }, + { + "epoch": 1.7048975957257346, + "grad_norm": 0.45505502820014954, + "learning_rate": 1e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7685636878013611, + "num_tokens": 1376390936.0, + 
"step": 4787 + }, + { + "epoch": 1.705253784505788, + "grad_norm": 0.43856099247932434, + "learning_rate": 1e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7779200971126556, + "num_tokens": 1376687849.0, + "step": 4788 + }, + { + "epoch": 1.7056099732858416, + "grad_norm": 0.5076470375061035, + "learning_rate": 1e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.783205658197403, + "num_tokens": 1376953140.0, + "step": 4789 + }, + { + "epoch": 1.705966162065895, + "grad_norm": 0.4638603925704956, + "learning_rate": 1e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7810916006565094, + "num_tokens": 1377237337.0, + "step": 4790 + }, + { + "epoch": 1.7063223508459484, + "grad_norm": 0.45854175090789795, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7612675875425339, + "num_tokens": 1377540081.0, + "step": 4791 + }, + { + "epoch": 1.7066785396260018, + "grad_norm": 0.43137645721435547, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.755291149020195, + "num_tokens": 1377833183.0, + "step": 4792 + }, + { + "epoch": 1.7070347284060552, + "grad_norm": 0.4777851104736328, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.762516051530838, + "num_tokens": 1378101764.0, + "step": 4793 + }, + { + "epoch": 1.7073909171861086, + "grad_norm": 0.41777682304382324, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7684090733528137, + "num_tokens": 1378423297.0, + "step": 4794 + }, + { + "epoch": 1.707747105966162, + "grad_norm": 0.45723792910575867, + "learning_rate": 1e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7881250977516174, + "num_tokens": 1378722417.0, + "step": 4795 + }, + { + "epoch": 1.7081032947462154, + "grad_norm": 0.4846177399158478, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7658776044845581, + "num_tokens": 1379019019.0, + "step": 4796 + }, + { + "epoch": 1.708459483526269, + "grad_norm": 0.5127087235450745, + "learning_rate": 1e-06, + 
"loss": 0.7574, + "mean_token_accuracy": 0.761937603354454, + "num_tokens": 1379283382.0, + "step": 4797 + }, + { + "epoch": 1.7088156723063224, + "grad_norm": 0.47467976808547974, + "learning_rate": 1e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7493857741355896, + "num_tokens": 1379550643.0, + "step": 4798 + }, + { + "epoch": 1.7091718610863758, + "grad_norm": 0.496336430311203, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7583719938993454, + "num_tokens": 1379820790.0, + "step": 4799 + }, + { + "epoch": 1.7095280498664294, + "grad_norm": 0.47758743166923523, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7548301964998245, + "num_tokens": 1380090006.0, + "step": 4800 + }, + { + "epoch": 1.7098842386464828, + "grad_norm": 0.5580882430076599, + "learning_rate": 1e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7665704935789108, + "num_tokens": 1380345529.0, + "step": 4801 + }, + { + "epoch": 1.7102404274265361, + "grad_norm": 0.4636550545692444, + "learning_rate": 1e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.77329221367836, + "num_tokens": 1380638952.0, + "step": 4802 + }, + { + "epoch": 1.7105966162065895, + "grad_norm": 0.49564772844314575, + "learning_rate": 1e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7748498916625977, + "num_tokens": 1380920547.0, + "step": 4803 + }, + { + "epoch": 1.710952804986643, + "grad_norm": 0.4193379580974579, + "learning_rate": 1e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7840025871992111, + "num_tokens": 1381251508.0, + "step": 4804 + }, + { + "epoch": 1.7113089937666963, + "grad_norm": 0.520034909248352, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7744692265987396, + "num_tokens": 1381533052.0, + "step": 4805 + }, + { + "epoch": 1.7116651825467497, + "grad_norm": 0.4715181291103363, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7711933702230453, + "num_tokens": 1381814880.0, + "step": 4806 + }, + { + 
"epoch": 1.712021371326803, + "grad_norm": 0.4762163758277893, + "learning_rate": 1e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7768588215112686, + "num_tokens": 1382096641.0, + "step": 4807 + }, + { + "epoch": 1.7123775601068565, + "grad_norm": 0.49084949493408203, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7607933878898621, + "num_tokens": 1382360831.0, + "step": 4808 + }, + { + "epoch": 1.71273374888691, + "grad_norm": 0.42312538623809814, + "learning_rate": 1e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7828419208526611, + "num_tokens": 1382663349.0, + "step": 4809 + }, + { + "epoch": 1.7130899376669635, + "grad_norm": 0.4910227656364441, + "learning_rate": 1e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7477255016565323, + "num_tokens": 1382963127.0, + "step": 4810 + }, + { + "epoch": 1.7134461264470169, + "grad_norm": 0.49566522240638733, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7687232494354248, + "num_tokens": 1383246608.0, + "step": 4811 + }, + { + "epoch": 1.7138023152270705, + "grad_norm": 0.48580870032310486, + "learning_rate": 1e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7830298990011215, + "num_tokens": 1383525629.0, + "step": 4812 + }, + { + "epoch": 1.7141585040071239, + "grad_norm": 0.4768140912055969, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7773083448410034, + "num_tokens": 1383826084.0, + "step": 4813 + }, + { + "epoch": 1.7145146927871773, + "grad_norm": 0.4611629843711853, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7578984200954437, + "num_tokens": 1384110004.0, + "step": 4814 + }, + { + "epoch": 1.7148708815672307, + "grad_norm": 0.43140631914138794, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7743200808763504, + "num_tokens": 1384408136.0, + "step": 4815 + }, + { + "epoch": 1.715227070347284, + "grad_norm": 0.48092398047447205, + "learning_rate": 1e-06, + "loss": 0.7955, + 
"mean_token_accuracy": 0.7496209442615509, + "num_tokens": 1384677969.0, + "step": 4816 + }, + { + "epoch": 1.7155832591273374, + "grad_norm": 0.5233674049377441, + "learning_rate": 1e-06, + "loss": 0.655, + "mean_token_accuracy": 0.786603718996048, + "num_tokens": 1384955080.0, + "step": 4817 + }, + { + "epoch": 1.7159394479073908, + "grad_norm": 0.4914408326148987, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7662863284349442, + "num_tokens": 1385238146.0, + "step": 4818 + }, + { + "epoch": 1.7162956366874442, + "grad_norm": 0.4516421854496002, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7705629467964172, + "num_tokens": 1385527773.0, + "step": 4819 + }, + { + "epoch": 1.7166518254674976, + "grad_norm": 0.4452803134918213, + "learning_rate": 1e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7507653832435608, + "num_tokens": 1385821680.0, + "step": 4820 + }, + { + "epoch": 1.7170080142475512, + "grad_norm": 0.4435681700706482, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.750594437122345, + "num_tokens": 1386138976.0, + "step": 4821 + }, + { + "epoch": 1.7173642030276046, + "grad_norm": 0.48399919271469116, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7396445572376251, + "num_tokens": 1386397074.0, + "step": 4822 + }, + { + "epoch": 1.717720391807658, + "grad_norm": 0.5251403450965881, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7518317401409149, + "num_tokens": 1386666673.0, + "step": 4823 + }, + { + "epoch": 1.7180765805877116, + "grad_norm": 0.4324668347835541, + "learning_rate": 1e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7638386934995651, + "num_tokens": 1386986605.0, + "step": 4824 + }, + { + "epoch": 1.718432769367765, + "grad_norm": 0.47874921560287476, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7597150504589081, + "num_tokens": 1387294267.0, + "step": 4825 + }, + { + "epoch": 
1.7187889581478184, + "grad_norm": 0.44104909896850586, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7498998492956161, + "num_tokens": 1387593648.0, + "step": 4826 + }, + { + "epoch": 1.7191451469278718, + "grad_norm": 0.4470613896846771, + "learning_rate": 1e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7522409111261368, + "num_tokens": 1387900649.0, + "step": 4827 + }, + { + "epoch": 1.7195013357079252, + "grad_norm": 0.4791065752506256, + "learning_rate": 1e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7674103677272797, + "num_tokens": 1388173688.0, + "step": 4828 + }, + { + "epoch": 1.7198575244879786, + "grad_norm": 0.4939381778240204, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7704061567783356, + "num_tokens": 1388440524.0, + "step": 4829 + }, + { + "epoch": 1.720213713268032, + "grad_norm": 0.4918767511844635, + "learning_rate": 1e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7730076611042023, + "num_tokens": 1388723687.0, + "step": 4830 + }, + { + "epoch": 1.7205699020480854, + "grad_norm": 0.5318161249160767, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7606925815343857, + "num_tokens": 1388963840.0, + "step": 4831 + }, + { + "epoch": 1.720926090828139, + "grad_norm": 0.45552849769592285, + "learning_rate": 1e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7708662301301956, + "num_tokens": 1389259487.0, + "step": 4832 + }, + { + "epoch": 1.7212822796081924, + "grad_norm": 0.4714120924472809, + "learning_rate": 1e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7726870030164719, + "num_tokens": 1389528410.0, + "step": 4833 + }, + { + "epoch": 1.7216384683882457, + "grad_norm": 0.47453778982162476, + "learning_rate": 1e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.768134206533432, + "num_tokens": 1389821541.0, + "step": 4834 + }, + { + "epoch": 1.7219946571682994, + "grad_norm": 0.38732874393463135, + "learning_rate": 1e-06, + "loss": 0.7098, + 
"mean_token_accuracy": 0.7748133093118668, + "num_tokens": 1390174439.0, + "step": 4835 + }, + { + "epoch": 1.7223508459483527, + "grad_norm": 0.47067856788635254, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7754658162593842, + "num_tokens": 1390463023.0, + "step": 4836 + }, + { + "epoch": 1.7227070347284061, + "grad_norm": 0.523944616317749, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7708628177642822, + "num_tokens": 1390707074.0, + "step": 4837 + }, + { + "epoch": 1.7230632235084595, + "grad_norm": 0.49477648735046387, + "learning_rate": 1e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7756939977407455, + "num_tokens": 1390982346.0, + "step": 4838 + }, + { + "epoch": 1.723419412288513, + "grad_norm": 0.44837239384651184, + "learning_rate": 1e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7539081126451492, + "num_tokens": 1391303257.0, + "step": 4839 + }, + { + "epoch": 1.7237756010685663, + "grad_norm": 0.4949322044849396, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7661320567131042, + "num_tokens": 1391604427.0, + "step": 4840 + }, + { + "epoch": 1.7241317898486197, + "grad_norm": 0.42945870757102966, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7694298774003983, + "num_tokens": 1391933431.0, + "step": 4841 + }, + { + "epoch": 1.724487978628673, + "grad_norm": 0.5096124410629272, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7706911116838455, + "num_tokens": 1392225196.0, + "step": 4842 + }, + { + "epoch": 1.7248441674087265, + "grad_norm": 0.4581123888492584, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7764961421489716, + "num_tokens": 1392518158.0, + "step": 4843 + }, + { + "epoch": 1.72520035618878, + "grad_norm": 0.4893372654914856, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7696798890829086, + "num_tokens": 1392775822.0, + "step": 4844 + }, + { + "epoch": 
1.7255565449688335, + "grad_norm": 0.49664419889450073, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7557047605514526, + "num_tokens": 1393063602.0, + "step": 4845 + }, + { + "epoch": 1.7259127337488869, + "grad_norm": 0.4407581090927124, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.765486404299736, + "num_tokens": 1393332421.0, + "step": 4846 + }, + { + "epoch": 1.7262689225289405, + "grad_norm": 0.43980279564857483, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.759210079908371, + "num_tokens": 1393617105.0, + "step": 4847 + }, + { + "epoch": 1.7266251113089939, + "grad_norm": 0.4517846405506134, + "learning_rate": 1e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7773745208978653, + "num_tokens": 1393940526.0, + "step": 4848 + }, + { + "epoch": 1.7269813000890473, + "grad_norm": 0.45673418045043945, + "learning_rate": 1e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7804945260286331, + "num_tokens": 1394268602.0, + "step": 4849 + }, + { + "epoch": 1.7273374888691007, + "grad_norm": 0.44941791892051697, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7709080725908279, + "num_tokens": 1394566852.0, + "step": 4850 + }, + { + "epoch": 1.727693677649154, + "grad_norm": 0.46989527344703674, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7667247354984283, + "num_tokens": 1394861312.0, + "step": 4851 + }, + { + "epoch": 1.7280498664292074, + "grad_norm": 0.4513716697692871, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7720401138067245, + "num_tokens": 1395149521.0, + "step": 4852 + }, + { + "epoch": 1.7284060552092608, + "grad_norm": 0.4645635783672333, + "learning_rate": 1e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.746340349316597, + "num_tokens": 1395450946.0, + "step": 4853 + }, + { + "epoch": 1.7287622439893142, + "grad_norm": 0.5153348445892334, + "learning_rate": 1e-06, + "loss": 0.7958, + 
"mean_token_accuracy": 0.752252072095871, + "num_tokens": 1395709459.0, + "step": 4854 + }, + { + "epoch": 1.7291184327693676, + "grad_norm": 0.4505300521850586, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7675778269767761, + "num_tokens": 1395982330.0, + "step": 4855 + }, + { + "epoch": 1.7294746215494212, + "grad_norm": 0.4617336094379425, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7591465711593628, + "num_tokens": 1396258774.0, + "step": 4856 + }, + { + "epoch": 1.7298308103294746, + "grad_norm": 0.5002415776252747, + "learning_rate": 1e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.7483216375112534, + "num_tokens": 1396515036.0, + "step": 4857 + }, + { + "epoch": 1.730186999109528, + "grad_norm": 0.46407851576805115, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7706488370895386, + "num_tokens": 1396820912.0, + "step": 4858 + }, + { + "epoch": 1.7305431878895816, + "grad_norm": 0.4874577522277832, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7633466273546219, + "num_tokens": 1397074981.0, + "step": 4859 + }, + { + "epoch": 1.730899376669635, + "grad_norm": 0.49870648980140686, + "learning_rate": 1e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7505944818258286, + "num_tokens": 1397364152.0, + "step": 4860 + }, + { + "epoch": 1.7312555654496884, + "grad_norm": 0.4896503984928131, + "learning_rate": 1e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7608576565980911, + "num_tokens": 1397666177.0, + "step": 4861 + }, + { + "epoch": 1.7316117542297418, + "grad_norm": 0.4661373198032379, + "learning_rate": 1e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7705289721488953, + "num_tokens": 1397981911.0, + "step": 4862 + }, + { + "epoch": 1.7319679430097952, + "grad_norm": 0.4934154450893402, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.763990044593811, + "num_tokens": 1398280801.0, + "step": 4863 + }, + { + "epoch": 
1.7323241317898486, + "grad_norm": 0.4236477017402649, + "learning_rate": 1e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7784427404403687, + "num_tokens": 1398599055.0, + "step": 4864 + }, + { + "epoch": 1.732680320569902, + "grad_norm": 0.47938767075538635, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7643117010593414, + "num_tokens": 1398909492.0, + "step": 4865 + }, + { + "epoch": 1.7330365093499553, + "grad_norm": 0.44228994846343994, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7791278064250946, + "num_tokens": 1399214863.0, + "step": 4866 + }, + { + "epoch": 1.733392698130009, + "grad_norm": 0.4975748062133789, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7621459066867828, + "num_tokens": 1399482452.0, + "step": 4867 + }, + { + "epoch": 1.7337488869100623, + "grad_norm": 0.46216654777526855, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.746118038892746, + "num_tokens": 1399773736.0, + "step": 4868 + }, + { + "epoch": 1.7341050756901157, + "grad_norm": 0.4821513295173645, + "learning_rate": 1e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.7679144889116287, + "num_tokens": 1400033098.0, + "step": 4869 + }, + { + "epoch": 1.7344612644701694, + "grad_norm": 0.5142987966537476, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7331906259059906, + "num_tokens": 1400308248.0, + "step": 4870 + }, + { + "epoch": 1.7348174532502227, + "grad_norm": 0.4368300139904022, + "learning_rate": 1e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7529932707548141, + "num_tokens": 1400611266.0, + "step": 4871 + }, + { + "epoch": 1.7351736420302761, + "grad_norm": 0.4743720293045044, + "learning_rate": 1e-06, + "loss": 0.6752, + "mean_token_accuracy": 0.7833226770162582, + "num_tokens": 1400894033.0, + "step": 4872 + }, + { + "epoch": 1.7355298308103295, + "grad_norm": 0.44022858142852783, + "learning_rate": 1e-06, + "loss": 0.7702, + 
"mean_token_accuracy": 0.7578434646129608, + "num_tokens": 1401188150.0, + "step": 4873 + }, + { + "epoch": 1.735886019590383, + "grad_norm": 0.4864477515220642, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7678906172513962, + "num_tokens": 1401483176.0, + "step": 4874 + }, + { + "epoch": 1.7362422083704363, + "grad_norm": 0.47646084427833557, + "learning_rate": 1e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7612247318029404, + "num_tokens": 1401782500.0, + "step": 4875 + }, + { + "epoch": 1.7365983971504897, + "grad_norm": 0.46689078211784363, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7734840363264084, + "num_tokens": 1402074939.0, + "step": 4876 + }, + { + "epoch": 1.736954585930543, + "grad_norm": 0.40838852524757385, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7741734087467194, + "num_tokens": 1402396749.0, + "step": 4877 + }, + { + "epoch": 1.7373107747105965, + "grad_norm": 0.44429853558540344, + "learning_rate": 1e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.758384644985199, + "num_tokens": 1402722856.0, + "step": 4878 + }, + { + "epoch": 1.73766696349065, + "grad_norm": 0.47378331422805786, + "learning_rate": 1e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7675574123859406, + "num_tokens": 1403008369.0, + "step": 4879 + }, + { + "epoch": 1.7380231522707035, + "grad_norm": 0.4510765075683594, + "learning_rate": 1e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7648454010486603, + "num_tokens": 1403303239.0, + "step": 4880 + }, + { + "epoch": 1.7383793410507569, + "grad_norm": 0.46514999866485596, + "learning_rate": 1e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7718549221754074, + "num_tokens": 1403577351.0, + "step": 4881 + }, + { + "epoch": 1.7387355298308105, + "grad_norm": 0.4847900867462158, + "learning_rate": 1e-06, + "loss": 0.7061, + "mean_token_accuracy": 0.772596925497055, + "num_tokens": 1403874500.0, + "step": 4882 + }, + { + "epoch": 
1.7390917186108639, + "grad_norm": 0.49432316422462463, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7521052807569504, + "num_tokens": 1404144628.0, + "step": 4883 + }, + { + "epoch": 1.7394479073909173, + "grad_norm": 0.44320058822631836, + "learning_rate": 1e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.7781669646501541, + "num_tokens": 1404439252.0, + "step": 4884 + }, + { + "epoch": 1.7398040961709706, + "grad_norm": 0.5734047293663025, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7587546408176422, + "num_tokens": 1404656675.0, + "step": 4885 + }, + { + "epoch": 1.740160284951024, + "grad_norm": 0.45069634914398193, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7512924373149872, + "num_tokens": 1404973909.0, + "step": 4886 + }, + { + "epoch": 1.7405164737310774, + "grad_norm": 0.44126078486442566, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.75563083589077, + "num_tokens": 1405276966.0, + "step": 4887 + }, + { + "epoch": 1.7408726625111308, + "grad_norm": 0.4688810408115387, + "learning_rate": 1e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7740607708692551, + "num_tokens": 1405555689.0, + "step": 4888 + }, + { + "epoch": 1.7412288512911842, + "grad_norm": 0.4741400182247162, + "learning_rate": 1e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7518423795700073, + "num_tokens": 1405847580.0, + "step": 4889 + }, + { + "epoch": 1.7415850400712376, + "grad_norm": 0.5076783299446106, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7462662607431412, + "num_tokens": 1406123847.0, + "step": 4890 + }, + { + "epoch": 1.7419412288512912, + "grad_norm": 0.4836253821849823, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7516747564077377, + "num_tokens": 1406415245.0, + "step": 4891 + }, + { + "epoch": 1.7422974176313446, + "grad_norm": 0.5257384777069092, + "learning_rate": 1e-06, + "loss": 0.7047, + 
"mean_token_accuracy": 0.7758273482322693, + "num_tokens": 1406677125.0, + "step": 4892 + }, + { + "epoch": 1.742653606411398, + "grad_norm": 0.46007680892944336, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7633692175149918, + "num_tokens": 1406941629.0, + "step": 4893 + }, + { + "epoch": 1.7430097951914516, + "grad_norm": 0.5080963373184204, + "learning_rate": 1e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7792712450027466, + "num_tokens": 1407205777.0, + "step": 4894 + }, + { + "epoch": 1.743365983971505, + "grad_norm": 0.42861250042915344, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7797596752643585, + "num_tokens": 1407500912.0, + "step": 4895 + }, + { + "epoch": 1.7437221727515584, + "grad_norm": 0.46536052227020264, + "learning_rate": 1e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7747104614973068, + "num_tokens": 1407809377.0, + "step": 4896 + }, + { + "epoch": 1.7440783615316118, + "grad_norm": 0.5259337425231934, + "learning_rate": 1e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7766791135072708, + "num_tokens": 1408084422.0, + "step": 4897 + }, + { + "epoch": 1.7444345503116652, + "grad_norm": 0.45191940665245056, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7743185758590698, + "num_tokens": 1408361246.0, + "step": 4898 + }, + { + "epoch": 1.7447907390917186, + "grad_norm": 0.4496272802352905, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7673890143632889, + "num_tokens": 1408683040.0, + "step": 4899 + }, + { + "epoch": 1.745146927871772, + "grad_norm": 0.4482289254665375, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7602032274007797, + "num_tokens": 1408979662.0, + "step": 4900 + }, + { + "epoch": 1.7455031166518253, + "grad_norm": 0.4422331750392914, + "learning_rate": 1e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.781426340341568, + "num_tokens": 1409280776.0, + "step": 4901 + }, + { + "epoch": 
1.745859305431879, + "grad_norm": 0.490465372800827, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7472648322582245, + "num_tokens": 1409552999.0, + "step": 4902 + }, + { + "epoch": 1.7462154942119323, + "grad_norm": 0.49821606278419495, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.767148494720459, + "num_tokens": 1409822170.0, + "step": 4903 + }, + { + "epoch": 1.7465716829919857, + "grad_norm": 0.5190935730934143, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7442934066057205, + "num_tokens": 1410089357.0, + "step": 4904 + }, + { + "epoch": 1.7469278717720393, + "grad_norm": 0.4680155813694, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7551634758710861, + "num_tokens": 1410374716.0, + "step": 4905 + }, + { + "epoch": 1.7472840605520927, + "grad_norm": 0.47818729281425476, + "learning_rate": 1e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7737360000610352, + "num_tokens": 1410670886.0, + "step": 4906 + }, + { + "epoch": 1.7476402493321461, + "grad_norm": 0.4773673415184021, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7647832632064819, + "num_tokens": 1410957758.0, + "step": 4907 + }, + { + "epoch": 1.7479964381121995, + "grad_norm": 0.45205816626548767, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.758735716342926, + "num_tokens": 1411282715.0, + "step": 4908 + }, + { + "epoch": 1.748352626892253, + "grad_norm": 0.46961313486099243, + "learning_rate": 1e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.776624858379364, + "num_tokens": 1411607385.0, + "step": 4909 + }, + { + "epoch": 1.7487088156723063, + "grad_norm": 0.5141846537590027, + "learning_rate": 1e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7615569978952408, + "num_tokens": 1411861465.0, + "step": 4910 + }, + { + "epoch": 1.7490650044523597, + "grad_norm": 0.4531620144844055, + "learning_rate": 1e-06, + "loss": 0.7501, + "mean_token_accuracy": 
0.7520982027053833, + "num_tokens": 1412134514.0, + "step": 4911 + }, + { + "epoch": 1.749421193232413, + "grad_norm": 0.47967660427093506, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7534395456314087, + "num_tokens": 1412414491.0, + "step": 4912 + }, + { + "epoch": 1.7497773820124665, + "grad_norm": 0.46952226758003235, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7663834542036057, + "num_tokens": 1412700175.0, + "step": 4913 + }, + { + "epoch": 1.75013357079252, + "grad_norm": 0.45590662956237793, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7686755508184433, + "num_tokens": 1413005403.0, + "step": 4914 + }, + { + "epoch": 1.7504897595725735, + "grad_norm": 0.46346622705459595, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7605626583099365, + "num_tokens": 1413299741.0, + "step": 4915 + }, + { + "epoch": 1.7508459483526269, + "grad_norm": 0.4580599069595337, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7603179216384888, + "num_tokens": 1413626724.0, + "step": 4916 + }, + { + "epoch": 1.7512021371326805, + "grad_norm": 0.49663111567497253, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7652492821216583, + "num_tokens": 1413901633.0, + "step": 4917 + }, + { + "epoch": 1.7515583259127339, + "grad_norm": 0.4882447123527527, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7505339682102203, + "num_tokens": 1414182595.0, + "step": 4918 + }, + { + "epoch": 1.7519145146927873, + "grad_norm": 0.44576495885849, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.754085049033165, + "num_tokens": 1414479041.0, + "step": 4919 + }, + { + "epoch": 1.7522707034728406, + "grad_norm": 0.4732472896575928, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7584635615348816, + "num_tokens": 1414787722.0, + "step": 4920 + }, + { + "epoch": 1.752626892252894, + "grad_norm": 
0.4746793210506439, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7646861672401428, + "num_tokens": 1415073025.0, + "step": 4921 + }, + { + "epoch": 1.7529830810329474, + "grad_norm": 0.4494970738887787, + "learning_rate": 1e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7721911817789078, + "num_tokens": 1415343058.0, + "step": 4922 + }, + { + "epoch": 1.7533392698130008, + "grad_norm": 0.4869566857814789, + "learning_rate": 1e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.7882023304700851, + "num_tokens": 1415597932.0, + "step": 4923 + }, + { + "epoch": 1.7536954585930542, + "grad_norm": 0.4670393466949463, + "learning_rate": 1e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7707237601280212, + "num_tokens": 1415875907.0, + "step": 4924 + }, + { + "epoch": 1.7540516473731076, + "grad_norm": 0.45626452565193176, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7698001116514206, + "num_tokens": 1416146721.0, + "step": 4925 + }, + { + "epoch": 1.7544078361531612, + "grad_norm": 0.5093575716018677, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7562957108020782, + "num_tokens": 1416428162.0, + "step": 4926 + }, + { + "epoch": 1.7547640249332146, + "grad_norm": 0.45146623253822327, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7665584236383438, + "num_tokens": 1416728530.0, + "step": 4927 + }, + { + "epoch": 1.755120213713268, + "grad_norm": 0.41679292917251587, + "learning_rate": 1e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.7851304560899734, + "num_tokens": 1417017108.0, + "step": 4928 + }, + { + "epoch": 1.7554764024933216, + "grad_norm": 0.5391764044761658, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7672652751207352, + "num_tokens": 1417283225.0, + "step": 4929 + }, + { + "epoch": 1.755832591273375, + "grad_norm": 0.4576353430747986, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7613799124956131, + 
"num_tokens": 1417570840.0, + "step": 4930 + }, + { + "epoch": 1.7561887800534284, + "grad_norm": 0.46961426734924316, + "learning_rate": 1e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7577026188373566, + "num_tokens": 1417858395.0, + "step": 4931 + }, + { + "epoch": 1.7565449688334818, + "grad_norm": 0.4764329493045807, + "learning_rate": 1e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7595860064029694, + "num_tokens": 1418119091.0, + "step": 4932 + }, + { + "epoch": 1.7569011576135352, + "grad_norm": 0.41825929284095764, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.763482853770256, + "num_tokens": 1418422665.0, + "step": 4933 + }, + { + "epoch": 1.7572573463935885, + "grad_norm": 0.526360273361206, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7677215188741684, + "num_tokens": 1418697358.0, + "step": 4934 + }, + { + "epoch": 1.757613535173642, + "grad_norm": 0.4651629626750946, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7477896064519882, + "num_tokens": 1419011105.0, + "step": 4935 + }, + { + "epoch": 1.7579697239536953, + "grad_norm": 0.47360414266586304, + "learning_rate": 1e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7773940116167068, + "num_tokens": 1419258830.0, + "step": 4936 + }, + { + "epoch": 1.758325912733749, + "grad_norm": 0.48002392053604126, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7668273001909256, + "num_tokens": 1419524718.0, + "step": 4937 + }, + { + "epoch": 1.7586821015138023, + "grad_norm": 0.4836137294769287, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.766690731048584, + "num_tokens": 1419812998.0, + "step": 4938 + }, + { + "epoch": 1.7590382902938557, + "grad_norm": 0.4972172975540161, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7488318532705307, + "num_tokens": 1420075007.0, + "step": 4939 + }, + { + "epoch": 1.7593944790739093, + "grad_norm": 0.5023255348205566, + 
"learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7477967292070389, + "num_tokens": 1420359648.0, + "step": 4940 + }, + { + "epoch": 1.7597506678539627, + "grad_norm": 0.4648487865924835, + "learning_rate": 1e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.7982029616832733, + "num_tokens": 1420654442.0, + "step": 4941 + }, + { + "epoch": 1.7601068566340161, + "grad_norm": 0.48389092087745667, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7664521485567093, + "num_tokens": 1420907539.0, + "step": 4942 + }, + { + "epoch": 1.7604630454140695, + "grad_norm": 0.4436027407646179, + "learning_rate": 1e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.7826118767261505, + "num_tokens": 1421203019.0, + "step": 4943 + }, + { + "epoch": 1.760819234194123, + "grad_norm": 0.46293047070503235, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7791945785284042, + "num_tokens": 1421519382.0, + "step": 4944 + }, + { + "epoch": 1.7611754229741763, + "grad_norm": 0.4610631763935089, + "learning_rate": 1e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7757814675569534, + "num_tokens": 1421788424.0, + "step": 4945 + }, + { + "epoch": 1.7615316117542297, + "grad_norm": 0.4501914381980896, + "learning_rate": 1e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7753315269947052, + "num_tokens": 1422081904.0, + "step": 4946 + }, + { + "epoch": 1.761887800534283, + "grad_norm": 0.46036630868911743, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.75776706635952, + "num_tokens": 1422389473.0, + "step": 4947 + }, + { + "epoch": 1.7622439893143365, + "grad_norm": 0.47117698192596436, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7567733973264694, + "num_tokens": 1422680227.0, + "step": 4948 + }, + { + "epoch": 1.76260017809439, + "grad_norm": 0.46426892280578613, + "learning_rate": 1e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7784751504659653, + "num_tokens": 1422949987.0, + 
"step": 4949 + }, + { + "epoch": 1.7629563668744435, + "grad_norm": 0.47955775260925293, + "learning_rate": 1e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7730938792228699, + "num_tokens": 1423222059.0, + "step": 4950 + }, + { + "epoch": 1.7633125556544968, + "grad_norm": 0.477071076631546, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7753613889217377, + "num_tokens": 1423518242.0, + "step": 4951 + }, + { + "epoch": 1.7636687444345505, + "grad_norm": 0.4171934723854065, + "learning_rate": 1e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7750624716281891, + "num_tokens": 1423844111.0, + "step": 4952 + }, + { + "epoch": 1.7640249332146039, + "grad_norm": 0.44139158725738525, + "learning_rate": 1e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.7742096185684204, + "num_tokens": 1424160129.0, + "step": 4953 + }, + { + "epoch": 1.7643811219946572, + "grad_norm": 0.4768664240837097, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7710383832454681, + "num_tokens": 1424436189.0, + "step": 4954 + }, + { + "epoch": 1.7647373107747106, + "grad_norm": 0.46012482047080994, + "learning_rate": 1e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.7900554090738297, + "num_tokens": 1424760780.0, + "step": 4955 + }, + { + "epoch": 1.765093499554764, + "grad_norm": 0.4484231173992157, + "learning_rate": 1e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7653712779283524, + "num_tokens": 1425072230.0, + "step": 4956 + }, + { + "epoch": 1.7654496883348174, + "grad_norm": 0.4541459083557129, + "learning_rate": 1e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7764205783605576, + "num_tokens": 1425400661.0, + "step": 4957 + }, + { + "epoch": 1.7658058771148708, + "grad_norm": 0.4742152988910675, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7620575726032257, + "num_tokens": 1425667375.0, + "step": 4958 + }, + { + "epoch": 1.7661620658949242, + "grad_norm": 0.51849365234375, + "learning_rate": 1e-06, + 
"loss": 0.71, + "mean_token_accuracy": 0.768901526927948, + "num_tokens": 1425944001.0, + "step": 4959 + }, + { + "epoch": 1.7665182546749776, + "grad_norm": 0.5043038129806519, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7594294399023056, + "num_tokens": 1426219126.0, + "step": 4960 + }, + { + "epoch": 1.7668744434550312, + "grad_norm": 0.4705956280231476, + "learning_rate": 1e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.7694370597600937, + "num_tokens": 1426528947.0, + "step": 4961 + }, + { + "epoch": 1.7672306322350846, + "grad_norm": 0.4849172830581665, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7600206285715103, + "num_tokens": 1426806691.0, + "step": 4962 + }, + { + "epoch": 1.767586821015138, + "grad_norm": 0.44173920154571533, + "learning_rate": 1e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7458735406398773, + "num_tokens": 1427122812.0, + "step": 4963 + }, + { + "epoch": 1.7679430097951916, + "grad_norm": 0.45703303813934326, + "learning_rate": 1e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.781115785241127, + "num_tokens": 1427387673.0, + "step": 4964 + }, + { + "epoch": 1.768299198575245, + "grad_norm": 0.4521693289279938, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.78129643201828, + "num_tokens": 1427688261.0, + "step": 4965 + }, + { + "epoch": 1.7686553873552984, + "grad_norm": 0.49937716126441956, + "learning_rate": 1e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7722299695014954, + "num_tokens": 1427967740.0, + "step": 4966 + }, + { + "epoch": 1.7690115761353518, + "grad_norm": 0.465476393699646, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7589352577924728, + "num_tokens": 1428246583.0, + "step": 4967 + }, + { + "epoch": 1.7693677649154052, + "grad_norm": 0.46288999915122986, + "learning_rate": 1e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.7848374098539352, + "num_tokens": 1428547085.0, + "step": 4968 + }, + { + 
"epoch": 1.7697239536954585, + "grad_norm": 0.481947660446167, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7665159106254578, + "num_tokens": 1428819503.0, + "step": 4969 + }, + { + "epoch": 1.770080142475512, + "grad_norm": 0.4844488501548767, + "learning_rate": 1e-06, + "loss": 0.711, + "mean_token_accuracy": 0.7695436626672745, + "num_tokens": 1429107331.0, + "step": 4970 + }, + { + "epoch": 1.7704363312555653, + "grad_norm": 0.4541338384151459, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7563586235046387, + "num_tokens": 1429394045.0, + "step": 4971 + }, + { + "epoch": 1.770792520035619, + "grad_norm": 0.4921478033065796, + "learning_rate": 1e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.774490013718605, + "num_tokens": 1429690424.0, + "step": 4972 + }, + { + "epoch": 1.7711487088156723, + "grad_norm": 0.4512507915496826, + "learning_rate": 1e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7757794409990311, + "num_tokens": 1429991466.0, + "step": 4973 + }, + { + "epoch": 1.7715048975957257, + "grad_norm": 0.45626088976860046, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7573825120925903, + "num_tokens": 1430295667.0, + "step": 4974 + }, + { + "epoch": 1.7718610863757793, + "grad_norm": 0.4433935880661011, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7668687850236893, + "num_tokens": 1430611648.0, + "step": 4975 + }, + { + "epoch": 1.7722172751558327, + "grad_norm": 0.4540022611618042, + "learning_rate": 1e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7869250923395157, + "num_tokens": 1430946395.0, + "step": 4976 + }, + { + "epoch": 1.772573463935886, + "grad_norm": 0.4370071291923523, + "learning_rate": 1e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7562147825956345, + "num_tokens": 1431265528.0, + "step": 4977 + }, + { + "epoch": 1.7729296527159395, + "grad_norm": 0.47841906547546387, + "learning_rate": 1e-06, + "loss": 0.7402, + 
"mean_token_accuracy": 0.7621431350708008, + "num_tokens": 1431526125.0, + "step": 4978 + }, + { + "epoch": 1.7732858414959929, + "grad_norm": 0.48499795794487, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7753695547580719, + "num_tokens": 1431809758.0, + "step": 4979 + }, + { + "epoch": 1.7736420302760463, + "grad_norm": 0.49711382389068604, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7699595093727112, + "num_tokens": 1432106198.0, + "step": 4980 + }, + { + "epoch": 1.7739982190560997, + "grad_norm": 0.45215705037117004, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7653499990701675, + "num_tokens": 1432432194.0, + "step": 4981 + }, + { + "epoch": 1.774354407836153, + "grad_norm": 0.5064123272895813, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7680815756320953, + "num_tokens": 1432739274.0, + "step": 4982 + }, + { + "epoch": 1.7747105966162064, + "grad_norm": 0.49193891882896423, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7519850730895996, + "num_tokens": 1433002125.0, + "step": 4983 + }, + { + "epoch": 1.77506678539626, + "grad_norm": 0.4539547264575958, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7685909569263458, + "num_tokens": 1433297174.0, + "step": 4984 + }, + { + "epoch": 1.7754229741763135, + "grad_norm": 0.47042423486709595, + "learning_rate": 1e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7774676233530045, + "num_tokens": 1433570466.0, + "step": 4985 + }, + { + "epoch": 1.7757791629563668, + "grad_norm": 0.4572932720184326, + "learning_rate": 1e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7749839127063751, + "num_tokens": 1433861738.0, + "step": 4986 + }, + { + "epoch": 1.7761353517364205, + "grad_norm": 0.4585943818092346, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7656600922346115, + "num_tokens": 1434164353.0, + "step": 4987 + }, + { + "epoch": 
1.7764915405164738, + "grad_norm": 0.47072264552116394, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7605038583278656, + "num_tokens": 1434475072.0, + "step": 4988 + }, + { + "epoch": 1.7768477292965272, + "grad_norm": 0.46228575706481934, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7686916291713715, + "num_tokens": 1434754952.0, + "step": 4989 + }, + { + "epoch": 1.7772039180765806, + "grad_norm": 0.47335606813430786, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7678328603506088, + "num_tokens": 1435018628.0, + "step": 4990 + }, + { + "epoch": 1.777560106856634, + "grad_norm": 0.4606415629386902, + "learning_rate": 1e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.7783032804727554, + "num_tokens": 1435325936.0, + "step": 4991 + }, + { + "epoch": 1.7779162956366874, + "grad_norm": 0.4799001216888428, + "learning_rate": 1e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7763863801956177, + "num_tokens": 1435610701.0, + "step": 4992 + }, + { + "epoch": 1.7782724844167408, + "grad_norm": 0.5112259984016418, + "learning_rate": 1e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7787049263715744, + "num_tokens": 1435885404.0, + "step": 4993 + }, + { + "epoch": 1.7786286731967942, + "grad_norm": 0.46505650877952576, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7687930911779404, + "num_tokens": 1436167930.0, + "step": 4994 + }, + { + "epoch": 1.7789848619768476, + "grad_norm": 0.46825066208839417, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7717665880918503, + "num_tokens": 1436469460.0, + "step": 4995 + }, + { + "epoch": 1.7793410507569012, + "grad_norm": 0.4572441577911377, + "learning_rate": 1e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7716351002454758, + "num_tokens": 1436770810.0, + "step": 4996 + }, + { + "epoch": 1.7796972395369546, + "grad_norm": 0.49278444051742554, + "learning_rate": 1e-06, + "loss": 0.7409, + 
"mean_token_accuracy": 0.764374628663063, + "num_tokens": 1437066050.0, + "step": 4997 + }, + { + "epoch": 1.780053428317008, + "grad_norm": 0.45930016040802, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7667940109968185, + "num_tokens": 1437366191.0, + "step": 4998 + }, + { + "epoch": 1.7804096170970616, + "grad_norm": 0.4755350351333618, + "learning_rate": 1e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7745906114578247, + "num_tokens": 1437695530.0, + "step": 4999 + }, + { + "epoch": 1.780765805877115, + "grad_norm": 0.46924182772636414, + "learning_rate": 1e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.7938815504312515, + "num_tokens": 1437979827.0, + "step": 5000 + }, + { + "epoch": 1.7811219946571684, + "grad_norm": 0.4433876574039459, + "learning_rate": 1e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7740974277257919, + "num_tokens": 1438265935.0, + "step": 5001 + }, + { + "epoch": 1.7814781834372218, + "grad_norm": 0.43404334783554077, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7588852047920227, + "num_tokens": 1438585782.0, + "step": 5002 + }, + { + "epoch": 1.7818343722172751, + "grad_norm": 0.4763874411582947, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.772974818944931, + "num_tokens": 1438889065.0, + "step": 5003 + }, + { + "epoch": 1.7821905609973285, + "grad_norm": 0.4338836967945099, + "learning_rate": 1e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7635460644960403, + "num_tokens": 1439208281.0, + "step": 5004 + }, + { + "epoch": 1.782546749777382, + "grad_norm": 0.46046537160873413, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7571267783641815, + "num_tokens": 1439506982.0, + "step": 5005 + }, + { + "epoch": 1.7829029385574353, + "grad_norm": 0.4701445400714874, + "learning_rate": 1e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.7765538990497589, + "num_tokens": 1439776171.0, + "step": 5006 + }, + { + "epoch": 
1.7832591273374887, + "grad_norm": 0.4563065767288208, + "learning_rate": 1e-06, + "loss": 0.651, + "mean_token_accuracy": 0.7846096158027649, + "num_tokens": 1440086302.0, + "step": 5007 + }, + { + "epoch": 1.7836153161175423, + "grad_norm": 0.4795377254486084, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.767856776714325, + "num_tokens": 1440344461.0, + "step": 5008 + }, + { + "epoch": 1.7839715048975957, + "grad_norm": 0.47223585844039917, + "learning_rate": 1e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7714156061410904, + "num_tokens": 1440613774.0, + "step": 5009 + }, + { + "epoch": 1.7843276936776493, + "grad_norm": 0.4276212155818939, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7622044235467911, + "num_tokens": 1440915114.0, + "step": 5010 + }, + { + "epoch": 1.7846838824577027, + "grad_norm": 0.4325282871723175, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7511838376522064, + "num_tokens": 1441225644.0, + "step": 5011 + }, + { + "epoch": 1.785040071237756, + "grad_norm": 0.49421265721321106, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7621786445379257, + "num_tokens": 1441518852.0, + "step": 5012 + }, + { + "epoch": 1.7853962600178095, + "grad_norm": 0.48489874601364136, + "learning_rate": 1e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7580490410327911, + "num_tokens": 1441784253.0, + "step": 5013 + }, + { + "epoch": 1.7857524487978629, + "grad_norm": 0.4791080057621002, + "learning_rate": 1e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7793430984020233, + "num_tokens": 1442048296.0, + "step": 5014 + }, + { + "epoch": 1.7861086375779163, + "grad_norm": 0.5070396065711975, + "learning_rate": 1e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7571821957826614, + "num_tokens": 1442339726.0, + "step": 5015 + }, + { + "epoch": 1.7864648263579697, + "grad_norm": 0.504291832447052, + "learning_rate": 1e-06, + "loss": 0.7726, + 
"mean_token_accuracy": 0.7539020329713821, + "num_tokens": 1442596072.0, + "step": 5016 + }, + { + "epoch": 1.786821015138023, + "grad_norm": 0.5057506561279297, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7585771381855011, + "num_tokens": 1442879331.0, + "step": 5017 + }, + { + "epoch": 1.7871772039180764, + "grad_norm": 0.49487969279289246, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7458631545305252, + "num_tokens": 1443173724.0, + "step": 5018 + }, + { + "epoch": 1.78753339269813, + "grad_norm": 0.4594346284866333, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7748195677995682, + "num_tokens": 1443482556.0, + "step": 5019 + }, + { + "epoch": 1.7878895814781834, + "grad_norm": 0.43257999420166016, + "learning_rate": 1e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.7878180295228958, + "num_tokens": 1443788419.0, + "step": 5020 + }, + { + "epoch": 1.7882457702582368, + "grad_norm": 0.47670841217041016, + "learning_rate": 1e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7825391888618469, + "num_tokens": 1444071614.0, + "step": 5021 + }, + { + "epoch": 1.7886019590382904, + "grad_norm": 0.4994124174118042, + "learning_rate": 1e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7530842572450638, + "num_tokens": 1444328682.0, + "step": 5022 + }, + { + "epoch": 1.7889581478183438, + "grad_norm": 0.46761971712112427, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7675536870956421, + "num_tokens": 1444623421.0, + "step": 5023 + }, + { + "epoch": 1.7893143365983972, + "grad_norm": 0.5098282098770142, + "learning_rate": 1e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.784207358956337, + "num_tokens": 1444908405.0, + "step": 5024 + }, + { + "epoch": 1.7896705253784506, + "grad_norm": 0.4628503620624542, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7662287205457687, + "num_tokens": 1445200393.0, + "step": 5025 + }, + { + "epoch": 
1.790026714158504, + "grad_norm": 0.4417584538459778, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.759446993470192, + "num_tokens": 1445485272.0, + "step": 5026 + }, + { + "epoch": 1.7903829029385574, + "grad_norm": 0.44220560789108276, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7591081857681274, + "num_tokens": 1445799327.0, + "step": 5027 + }, + { + "epoch": 1.7907390917186108, + "grad_norm": 0.46723878383636475, + "learning_rate": 1e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7812891900539398, + "num_tokens": 1446115235.0, + "step": 5028 + }, + { + "epoch": 1.7910952804986642, + "grad_norm": 0.47400468587875366, + "learning_rate": 1e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7800759673118591, + "num_tokens": 1446399502.0, + "step": 5029 + }, + { + "epoch": 1.7914514692787176, + "grad_norm": 0.4516262412071228, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7418945729732513, + "num_tokens": 1446692393.0, + "step": 5030 + }, + { + "epoch": 1.7918076580587712, + "grad_norm": 0.4651176631450653, + "learning_rate": 1e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7701784521341324, + "num_tokens": 1446944378.0, + "step": 5031 + }, + { + "epoch": 1.7921638468388246, + "grad_norm": 0.47850409150123596, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7628382593393326, + "num_tokens": 1447246797.0, + "step": 5032 + }, + { + "epoch": 1.792520035618878, + "grad_norm": 0.44021546840667725, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7672459036111832, + "num_tokens": 1447551834.0, + "step": 5033 + }, + { + "epoch": 1.7928762243989316, + "grad_norm": 0.46973440051078796, + "learning_rate": 1e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7756233960390091, + "num_tokens": 1447868520.0, + "step": 5034 + }, + { + "epoch": 1.793232413178985, + "grad_norm": 0.3957628607749939, + "learning_rate": 1e-06, + "loss": 0.6803, + 
"mean_token_accuracy": 0.780648946762085, + "num_tokens": 1448218458.0, + "step": 5035 + }, + { + "epoch": 1.7935886019590384, + "grad_norm": 0.5117723941802979, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7580257952213287, + "num_tokens": 1448503685.0, + "step": 5036 + }, + { + "epoch": 1.7939447907390917, + "grad_norm": 0.5406283140182495, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7599017173051834, + "num_tokens": 1448762022.0, + "step": 5037 + }, + { + "epoch": 1.7943009795191451, + "grad_norm": 0.4820196032524109, + "learning_rate": 1e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.769489586353302, + "num_tokens": 1449033503.0, + "step": 5038 + }, + { + "epoch": 1.7946571682991985, + "grad_norm": 0.44399386644363403, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7672622054815292, + "num_tokens": 1449345341.0, + "step": 5039 + }, + { + "epoch": 1.795013357079252, + "grad_norm": 0.4457972049713135, + "learning_rate": 1e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7750712633132935, + "num_tokens": 1449680319.0, + "step": 5040 + }, + { + "epoch": 1.7953695458593053, + "grad_norm": 0.49701231718063354, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7556134313344955, + "num_tokens": 1449971707.0, + "step": 5041 + }, + { + "epoch": 1.7957257346393587, + "grad_norm": 0.5149100422859192, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7590678483247757, + "num_tokens": 1450239529.0, + "step": 5042 + }, + { + "epoch": 1.7960819234194123, + "grad_norm": 0.45531177520751953, + "learning_rate": 1e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.7844183593988419, + "num_tokens": 1450536692.0, + "step": 5043 + }, + { + "epoch": 1.7964381121994657, + "grad_norm": 0.4398915767669678, + "learning_rate": 1e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7613511383533478, + "num_tokens": 1450858238.0, + "step": 5044 + }, + { + "epoch": 
1.7967943009795193, + "grad_norm": 0.49002575874328613, + "learning_rate": 1e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7692277133464813, + "num_tokens": 1451150959.0, + "step": 5045 + }, + { + "epoch": 1.7971504897595727, + "grad_norm": 0.5275981426239014, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7599822133779526, + "num_tokens": 1451410394.0, + "step": 5046 + }, + { + "epoch": 1.797506678539626, + "grad_norm": 0.5356876254081726, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7522751241922379, + "num_tokens": 1451677030.0, + "step": 5047 + }, + { + "epoch": 1.7978628673196795, + "grad_norm": 0.5483888387680054, + "learning_rate": 1e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7566877752542496, + "num_tokens": 1451917868.0, + "step": 5048 + }, + { + "epoch": 1.7982190560997329, + "grad_norm": 0.4356825053691864, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7569262385368347, + "num_tokens": 1452214255.0, + "step": 5049 + }, + { + "epoch": 1.7985752448797863, + "grad_norm": 0.44331881403923035, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7647330313920975, + "num_tokens": 1452517599.0, + "step": 5050 + }, + { + "epoch": 1.7989314336598397, + "grad_norm": 0.5264084339141846, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7508241087198257, + "num_tokens": 1452820154.0, + "step": 5051 + }, + { + "epoch": 1.799287622439893, + "grad_norm": 0.4618173837661743, + "learning_rate": 1e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.7931118309497833, + "num_tokens": 1453149020.0, + "step": 5052 + }, + { + "epoch": 1.7996438112199464, + "grad_norm": 0.5058026909828186, + "learning_rate": 1e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7537569403648376, + "num_tokens": 1453416578.0, + "step": 5053 + }, + { + "epoch": 1.8, + "grad_norm": 0.437029093503952, + "learning_rate": 1e-06, + "loss": 0.6649, + "mean_token_accuracy": 
0.7822331190109253, + "num_tokens": 1453682856.0, + "step": 5054 + }, + { + "epoch": 1.8003561887800534, + "grad_norm": 0.48113805055618286, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.761157214641571, + "num_tokens": 1453941848.0, + "step": 5055 + }, + { + "epoch": 1.8007123775601068, + "grad_norm": 0.454449325799942, + "learning_rate": 1e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7770180702209473, + "num_tokens": 1454205919.0, + "step": 5056 + }, + { + "epoch": 1.8010685663401604, + "grad_norm": 0.4768904447555542, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7691348493099213, + "num_tokens": 1454474538.0, + "step": 5057 + }, + { + "epoch": 1.8014247551202138, + "grad_norm": 0.5051757097244263, + "learning_rate": 1e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7584786713123322, + "num_tokens": 1454777153.0, + "step": 5058 + }, + { + "epoch": 1.8017809439002672, + "grad_norm": 0.5103880167007446, + "learning_rate": 1e-06, + "loss": 0.6473, + "mean_token_accuracy": 0.7895405143499374, + "num_tokens": 1455040629.0, + "step": 5059 + }, + { + "epoch": 1.8021371326803206, + "grad_norm": 0.45734208822250366, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7723913490772247, + "num_tokens": 1455358505.0, + "step": 5060 + }, + { + "epoch": 1.802493321460374, + "grad_norm": 0.46347129344940186, + "learning_rate": 1e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7554830610752106, + "num_tokens": 1455627883.0, + "step": 5061 + }, + { + "epoch": 1.8028495102404274, + "grad_norm": 0.422134667634964, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7659169733524323, + "num_tokens": 1455945432.0, + "step": 5062 + }, + { + "epoch": 1.8032056990204808, + "grad_norm": 0.4263858497142792, + "learning_rate": 1e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.76142618060112, + "num_tokens": 1456257881.0, + "step": 5063 + }, + { + "epoch": 1.8035618878005342, + "grad_norm": 
0.5158461332321167, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7647083848714828, + "num_tokens": 1456518228.0, + "step": 5064 + }, + { + "epoch": 1.8039180765805876, + "grad_norm": 0.4975389838218689, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7596039026975632, + "num_tokens": 1456791055.0, + "step": 5065 + }, + { + "epoch": 1.8042742653606412, + "grad_norm": 0.477850079536438, + "learning_rate": 1e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.7939428836107254, + "num_tokens": 1457069447.0, + "step": 5066 + }, + { + "epoch": 1.8046304541406946, + "grad_norm": 0.40570950508117676, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.765790730714798, + "num_tokens": 1457390750.0, + "step": 5067 + }, + { + "epoch": 1.804986642920748, + "grad_norm": 0.45380285382270813, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7706494033336639, + "num_tokens": 1457680639.0, + "step": 5068 + }, + { + "epoch": 1.8053428317008016, + "grad_norm": 0.45807236433029175, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7645492702722549, + "num_tokens": 1457974315.0, + "step": 5069 + }, + { + "epoch": 1.805699020480855, + "grad_norm": 0.4474529027938843, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7714301496744156, + "num_tokens": 1458261351.0, + "step": 5070 + }, + { + "epoch": 1.8060552092609083, + "grad_norm": 0.4379662275314331, + "learning_rate": 1e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7508169412612915, + "num_tokens": 1458591606.0, + "step": 5071 + }, + { + "epoch": 1.8064113980409617, + "grad_norm": 0.4487575590610504, + "learning_rate": 1e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7706756293773651, + "num_tokens": 1458886213.0, + "step": 5072 + }, + { + "epoch": 1.8067675868210151, + "grad_norm": 0.5037705302238464, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7715217620134354, + 
"num_tokens": 1459191173.0, + "step": 5073 + }, + { + "epoch": 1.8071237756010685, + "grad_norm": 0.446260005235672, + "learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7724399864673615, + "num_tokens": 1459476029.0, + "step": 5074 + }, + { + "epoch": 1.807479964381122, + "grad_norm": 0.49610719084739685, + "learning_rate": 1e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7665444016456604, + "num_tokens": 1459743965.0, + "step": 5075 + }, + { + "epoch": 1.8078361531611753, + "grad_norm": 0.48836010694503784, + "learning_rate": 1e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7801372706890106, + "num_tokens": 1460041293.0, + "step": 5076 + }, + { + "epoch": 1.8081923419412287, + "grad_norm": 0.44363218545913696, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.772598072886467, + "num_tokens": 1460326696.0, + "step": 5077 + }, + { + "epoch": 1.8085485307212823, + "grad_norm": 0.45177075266838074, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7682397067546844, + "num_tokens": 1460642747.0, + "step": 5078 + }, + { + "epoch": 1.8089047195013357, + "grad_norm": 0.4625159800052643, + "learning_rate": 1e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7842716574668884, + "num_tokens": 1460905451.0, + "step": 5079 + }, + { + "epoch": 1.8092609082813893, + "grad_norm": 0.510180652141571, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.754773423075676, + "num_tokens": 1461170378.0, + "step": 5080 + }, + { + "epoch": 1.8096170970614427, + "grad_norm": 0.49567747116088867, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7740953862667084, + "num_tokens": 1461458868.0, + "step": 5081 + }, + { + "epoch": 1.809973285841496, + "grad_norm": 0.48882514238357544, + "learning_rate": 1e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7782317101955414, + "num_tokens": 1461753866.0, + "step": 5082 + }, + { + "epoch": 1.8103294746215495, + "grad_norm": 0.4123920798301697, + 
"learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7635101974010468, + "num_tokens": 1462046623.0, + "step": 5083 + }, + { + "epoch": 1.8106856634016029, + "grad_norm": 0.5050338506698608, + "learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7656543105840683, + "num_tokens": 1462297872.0, + "step": 5084 + }, + { + "epoch": 1.8110418521816563, + "grad_norm": 0.47721683979034424, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7621365636587143, + "num_tokens": 1462571043.0, + "step": 5085 + }, + { + "epoch": 1.8113980409617096, + "grad_norm": 0.45237448811531067, + "learning_rate": 1e-06, + "loss": 0.711, + "mean_token_accuracy": 0.7723067700862885, + "num_tokens": 1462881684.0, + "step": 5086 + }, + { + "epoch": 1.811754229741763, + "grad_norm": 0.490774542093277, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7687295079231262, + "num_tokens": 1463161236.0, + "step": 5087 + }, + { + "epoch": 1.8121104185218164, + "grad_norm": 0.4550495147705078, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7489631772041321, + "num_tokens": 1463450120.0, + "step": 5088 + }, + { + "epoch": 1.81246660730187, + "grad_norm": 0.48676797747612, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7555049061775208, + "num_tokens": 1463736330.0, + "step": 5089 + }, + { + "epoch": 1.8128227960819234, + "grad_norm": 0.4898940622806549, + "learning_rate": 1e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.7770703285932541, + "num_tokens": 1463990898.0, + "step": 5090 + }, + { + "epoch": 1.8131789848619768, + "grad_norm": 0.44728168845176697, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7720672786235809, + "num_tokens": 1464275452.0, + "step": 5091 + }, + { + "epoch": 1.8135351736420304, + "grad_norm": 0.47138524055480957, + "learning_rate": 1e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7708868831396103, + "num_tokens": 1464576855.0, + 
"step": 5092 + }, + { + "epoch": 1.8138913624220838, + "grad_norm": 0.4598880708217621, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7496102899312973, + "num_tokens": 1464861793.0, + "step": 5093 + }, + { + "epoch": 1.8142475512021372, + "grad_norm": 0.44140809774398804, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7708977907896042, + "num_tokens": 1465159531.0, + "step": 5094 + }, + { + "epoch": 1.8146037399821906, + "grad_norm": 0.4330042898654938, + "learning_rate": 1e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7723535299301147, + "num_tokens": 1465447573.0, + "step": 5095 + }, + { + "epoch": 1.814959928762244, + "grad_norm": 0.4662054777145386, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7628981918096542, + "num_tokens": 1465728110.0, + "step": 5096 + }, + { + "epoch": 1.8153161175422974, + "grad_norm": 0.46054282784461975, + "learning_rate": 1e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7770748436450958, + "num_tokens": 1466019002.0, + "step": 5097 + }, + { + "epoch": 1.8156723063223508, + "grad_norm": 0.47532618045806885, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7639568597078323, + "num_tokens": 1466308172.0, + "step": 5098 + }, + { + "epoch": 1.8160284951024042, + "grad_norm": 0.4540393650531769, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7609094828367233, + "num_tokens": 1466612723.0, + "step": 5099 + }, + { + "epoch": 1.8163846838824576, + "grad_norm": 0.5285928845405579, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7625218331813812, + "num_tokens": 1466868790.0, + "step": 5100 + }, + { + "epoch": 1.8167408726625112, + "grad_norm": 0.4777180552482605, + "learning_rate": 1e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7750281244516373, + "num_tokens": 1467173601.0, + "step": 5101 + }, + { + "epoch": 1.8170970614425646, + "grad_norm": 0.47297272086143494, + "learning_rate": 1e-06, + 
"loss": 0.7048, + "mean_token_accuracy": 0.7715339213609695, + "num_tokens": 1467442927.0, + "step": 5102 + }, + { + "epoch": 1.817453250222618, + "grad_norm": 0.4784405529499054, + "learning_rate": 1e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7796309292316437, + "num_tokens": 1467730948.0, + "step": 5103 + }, + { + "epoch": 1.8178094390026716, + "grad_norm": 0.4311677813529968, + "learning_rate": 1e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7740888744592667, + "num_tokens": 1468057815.0, + "step": 5104 + }, + { + "epoch": 1.818165627782725, + "grad_norm": 0.4602120518684387, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7734951674938202, + "num_tokens": 1468339604.0, + "step": 5105 + }, + { + "epoch": 1.8185218165627783, + "grad_norm": 0.4930037558078766, + "learning_rate": 1e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7627429217100143, + "num_tokens": 1468609710.0, + "step": 5106 + }, + { + "epoch": 1.8188780053428317, + "grad_norm": 0.46880555152893066, + "learning_rate": 1e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.7864478379487991, + "num_tokens": 1468897737.0, + "step": 5107 + }, + { + "epoch": 1.8192341941228851, + "grad_norm": 0.4846763014793396, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7582185715436935, + "num_tokens": 1469183713.0, + "step": 5108 + }, + { + "epoch": 1.8195903829029385, + "grad_norm": 0.43625107407569885, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7588730454444885, + "num_tokens": 1469517124.0, + "step": 5109 + }, + { + "epoch": 1.819946571682992, + "grad_norm": 0.49784839153289795, + "learning_rate": 1e-06, + "loss": 0.68, + "mean_token_accuracy": 0.782769963145256, + "num_tokens": 1469771333.0, + "step": 5110 + }, + { + "epoch": 1.8203027604630453, + "grad_norm": 0.45130088925361633, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7750994712114334, + "num_tokens": 1470080883.0, + "step": 5111 + }, + { + 
"epoch": 1.8206589492430987, + "grad_norm": 0.45399463176727295, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7640803754329681, + "num_tokens": 1470397884.0, + "step": 5112 + }, + { + "epoch": 1.8210151380231523, + "grad_norm": 0.49962159991264343, + "learning_rate": 1e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.7666546255350113, + "num_tokens": 1470672172.0, + "step": 5113 + }, + { + "epoch": 1.8213713268032057, + "grad_norm": 0.5194562077522278, + "learning_rate": 1e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7478182464838028, + "num_tokens": 1470937788.0, + "step": 5114 + }, + { + "epoch": 1.8217275155832593, + "grad_norm": 0.5172778367996216, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7693514227867126, + "num_tokens": 1471208311.0, + "step": 5115 + }, + { + "epoch": 1.8220837043633127, + "grad_norm": 0.47845762968063354, + "learning_rate": 1e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7774382531642914, + "num_tokens": 1471505846.0, + "step": 5116 + }, + { + "epoch": 1.822439893143366, + "grad_norm": 0.48953935503959656, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7430529743432999, + "num_tokens": 1471783744.0, + "step": 5117 + }, + { + "epoch": 1.8227960819234195, + "grad_norm": 0.4750378131866455, + "learning_rate": 1e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7719602584838867, + "num_tokens": 1472056164.0, + "step": 5118 + }, + { + "epoch": 1.8231522707034729, + "grad_norm": 0.4589380919933319, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7678457498550415, + "num_tokens": 1472352984.0, + "step": 5119 + }, + { + "epoch": 1.8235084594835262, + "grad_norm": 0.4707253873348236, + "learning_rate": 1e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7558976113796234, + "num_tokens": 1472643291.0, + "step": 5120 + }, + { + "epoch": 1.8238646482635796, + "grad_norm": 0.48937246203422546, + "learning_rate": 1e-06, + "loss": 0.7326, + 
"mean_token_accuracy": 0.7661200761795044, + "num_tokens": 1472955503.0, + "step": 5121 + }, + { + "epoch": 1.824220837043633, + "grad_norm": 0.5567126870155334, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7617095857858658, + "num_tokens": 1473212498.0, + "step": 5122 + }, + { + "epoch": 1.8245770258236864, + "grad_norm": 0.5197310447692871, + "learning_rate": 1e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.767898678779602, + "num_tokens": 1473463186.0, + "step": 5123 + }, + { + "epoch": 1.82493321460374, + "grad_norm": 0.48168548941612244, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7572944462299347, + "num_tokens": 1473738484.0, + "step": 5124 + }, + { + "epoch": 1.8252894033837934, + "grad_norm": 0.44523319602012634, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.753119632601738, + "num_tokens": 1474007513.0, + "step": 5125 + }, + { + "epoch": 1.8256455921638468, + "grad_norm": 0.47598955035209656, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.762916162610054, + "num_tokens": 1474302038.0, + "step": 5126 + }, + { + "epoch": 1.8260017809439004, + "grad_norm": 0.4558809697628021, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7692268788814545, + "num_tokens": 1474610449.0, + "step": 5127 + }, + { + "epoch": 1.8263579697239538, + "grad_norm": 0.535872220993042, + "learning_rate": 1e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.7653715163469315, + "num_tokens": 1474848692.0, + "step": 5128 + }, + { + "epoch": 1.8267141585040072, + "grad_norm": 0.5332196950912476, + "learning_rate": 1e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7659872323274612, + "num_tokens": 1475111950.0, + "step": 5129 + }, + { + "epoch": 1.8270703472840606, + "grad_norm": 0.42492204904556274, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7676081955432892, + "num_tokens": 1475406623.0, + "step": 5130 + }, + { + "epoch": 
1.827426536064114, + "grad_norm": 0.48248493671417236, + "learning_rate": 1e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7608515918254852, + "num_tokens": 1475674259.0, + "step": 5131 + }, + { + "epoch": 1.8277827248441674, + "grad_norm": 0.5095698237419128, + "learning_rate": 1e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7651045322418213, + "num_tokens": 1475932971.0, + "step": 5132 + }, + { + "epoch": 1.8281389136242208, + "grad_norm": 0.43933597207069397, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7579720914363861, + "num_tokens": 1476246286.0, + "step": 5133 + }, + { + "epoch": 1.8284951024042742, + "grad_norm": 0.5111088752746582, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7562938928604126, + "num_tokens": 1476476942.0, + "step": 5134 + }, + { + "epoch": 1.8288512911843275, + "grad_norm": 0.47417208552360535, + "learning_rate": 1e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7851603478193283, + "num_tokens": 1476751394.0, + "step": 5135 + }, + { + "epoch": 1.8292074799643812, + "grad_norm": 0.4776183068752289, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7693273276090622, + "num_tokens": 1477030672.0, + "step": 5136 + }, + { + "epoch": 1.8295636687444345, + "grad_norm": 0.5021313428878784, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7336321324110031, + "num_tokens": 1477316871.0, + "step": 5137 + }, + { + "epoch": 1.829919857524488, + "grad_norm": 0.4677179157733917, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7609046548604965, + "num_tokens": 1477586466.0, + "step": 5138 + }, + { + "epoch": 1.8302760463045415, + "grad_norm": 0.42045098543167114, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7645993232727051, + "num_tokens": 1477909469.0, + "step": 5139 + }, + { + "epoch": 1.830632235084595, + "grad_norm": 0.4764466881752014, + "learning_rate": 1e-06, + "loss": 0.754, + 
"mean_token_accuracy": 0.7626850306987762, + "num_tokens": 1478196991.0, + "step": 5140 + }, + { + "epoch": 1.8309884238646483, + "grad_norm": 0.4423510432243347, + "learning_rate": 1e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7640398740768433, + "num_tokens": 1478478130.0, + "step": 5141 + }, + { + "epoch": 1.8313446126447017, + "grad_norm": 0.44956594705581665, + "learning_rate": 1e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7770861983299255, + "num_tokens": 1478764128.0, + "step": 5142 + }, + { + "epoch": 1.831700801424755, + "grad_norm": 0.4642525911331177, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7675415873527527, + "num_tokens": 1479034692.0, + "step": 5143 + }, + { + "epoch": 1.8320569902048085, + "grad_norm": 0.4818415641784668, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7638383507728577, + "num_tokens": 1479309891.0, + "step": 5144 + }, + { + "epoch": 1.832413178984862, + "grad_norm": 0.465962678194046, + "learning_rate": 1e-06, + "loss": 0.703, + "mean_token_accuracy": 0.769112765789032, + "num_tokens": 1479596737.0, + "step": 5145 + }, + { + "epoch": 1.8327693677649153, + "grad_norm": 0.47176656126976013, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7633879631757736, + "num_tokens": 1479901600.0, + "step": 5146 + }, + { + "epoch": 1.8331255565449687, + "grad_norm": 0.4594959020614624, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7665033936500549, + "num_tokens": 1480188056.0, + "step": 5147 + }, + { + "epoch": 1.8334817453250223, + "grad_norm": 0.4488251507282257, + "learning_rate": 1e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.7772020548582077, + "num_tokens": 1480458055.0, + "step": 5148 + }, + { + "epoch": 1.8338379341050757, + "grad_norm": 0.4373477101325989, + "learning_rate": 1e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7783115804195404, + "num_tokens": 1480767384.0, + "step": 5149 + }, + { + "epoch": 
1.834194122885129, + "grad_norm": 0.447809100151062, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.767595112323761, + "num_tokens": 1481050285.0, + "step": 5150 + }, + { + "epoch": 1.8345503116651827, + "grad_norm": 0.4372153878211975, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7613889575004578, + "num_tokens": 1481369179.0, + "step": 5151 + }, + { + "epoch": 1.834906500445236, + "grad_norm": 0.4358801245689392, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7657589763402939, + "num_tokens": 1481678749.0, + "step": 5152 + }, + { + "epoch": 1.8352626892252895, + "grad_norm": 0.4234166443347931, + "learning_rate": 1e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7774751335382462, + "num_tokens": 1482003173.0, + "step": 5153 + }, + { + "epoch": 1.8356188780053428, + "grad_norm": 0.45598506927490234, + "learning_rate": 1e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.7825696468353271, + "num_tokens": 1482276152.0, + "step": 5154 + }, + { + "epoch": 1.8359750667853962, + "grad_norm": 0.445860356092453, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7658897340297699, + "num_tokens": 1482549258.0, + "step": 5155 + }, + { + "epoch": 1.8363312555654496, + "grad_norm": 0.48128649592399597, + "learning_rate": 1e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7805734574794769, + "num_tokens": 1482832228.0, + "step": 5156 + }, + { + "epoch": 1.836687444345503, + "grad_norm": 0.4367307126522064, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7590111196041107, + "num_tokens": 1483129243.0, + "step": 5157 + }, + { + "epoch": 1.8370436331255564, + "grad_norm": 0.4467642307281494, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7717863321304321, + "num_tokens": 1483417466.0, + "step": 5158 + }, + { + "epoch": 1.83739982190561, + "grad_norm": 0.5047509074211121, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 
0.7542275935411453, + "num_tokens": 1483683515.0, + "step": 5159 + }, + { + "epoch": 1.8377560106856634, + "grad_norm": 0.482514351606369, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7582865953445435, + "num_tokens": 1483969863.0, + "step": 5160 + }, + { + "epoch": 1.8381121994657168, + "grad_norm": 0.47937509417533875, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7710074484348297, + "num_tokens": 1484236750.0, + "step": 5161 + }, + { + "epoch": 1.8384683882457704, + "grad_norm": 0.4675256013870239, + "learning_rate": 1e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7783832848072052, + "num_tokens": 1484507316.0, + "step": 5162 + }, + { + "epoch": 1.8388245770258238, + "grad_norm": 0.516897439956665, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7549022138118744, + "num_tokens": 1484769514.0, + "step": 5163 + }, + { + "epoch": 1.8391807658058772, + "grad_norm": 0.4707116186618805, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7742590457201004, + "num_tokens": 1485042716.0, + "step": 5164 + }, + { + "epoch": 1.8395369545859306, + "grad_norm": 0.5013059377670288, + "learning_rate": 1e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7740169912576675, + "num_tokens": 1485307016.0, + "step": 5165 + }, + { + "epoch": 1.839893143365984, + "grad_norm": 0.49668088555336, + "learning_rate": 1e-06, + "loss": 0.7061, + "mean_token_accuracy": 0.7679041922092438, + "num_tokens": 1485542421.0, + "step": 5166 + }, + { + "epoch": 1.8402493321460374, + "grad_norm": 0.48013073205947876, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7587864398956299, + "num_tokens": 1485829356.0, + "step": 5167 + }, + { + "epoch": 1.8406055209260908, + "grad_norm": 0.48448240756988525, + "learning_rate": 1e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7741394191980362, + "num_tokens": 1486115599.0, + "step": 5168 + }, + { + "epoch": 1.8409617097061441, + "grad_norm": 
0.4411376714706421, + "learning_rate": 1e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7738808393478394, + "num_tokens": 1486392072.0, + "step": 5169 + }, + { + "epoch": 1.8413178984861975, + "grad_norm": 0.4656524956226349, + "learning_rate": 1e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7562366724014282, + "num_tokens": 1486671501.0, + "step": 5170 + }, + { + "epoch": 1.8416740872662511, + "grad_norm": 0.5041651129722595, + "learning_rate": 1e-06, + "loss": 0.6809, + "mean_token_accuracy": 0.7742714583873749, + "num_tokens": 1486933452.0, + "step": 5171 + }, + { + "epoch": 1.8420302760463045, + "grad_norm": 0.4437294602394104, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7562605887651443, + "num_tokens": 1487228530.0, + "step": 5172 + }, + { + "epoch": 1.842386464826358, + "grad_norm": 0.4564487338066101, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7702260911464691, + "num_tokens": 1487520551.0, + "step": 5173 + }, + { + "epoch": 1.8427426536064115, + "grad_norm": 0.45337986946105957, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.770092323422432, + "num_tokens": 1487816370.0, + "step": 5174 + }, + { + "epoch": 1.843098842386465, + "grad_norm": 0.48177528381347656, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7610721290111542, + "num_tokens": 1488095086.0, + "step": 5175 + }, + { + "epoch": 1.8434550311665183, + "grad_norm": 0.44149553775787354, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7454878985881805, + "num_tokens": 1488375599.0, + "step": 5176 + }, + { + "epoch": 1.8438112199465717, + "grad_norm": 0.473481148481369, + "learning_rate": 1e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7751776874065399, + "num_tokens": 1488665653.0, + "step": 5177 + }, + { + "epoch": 1.844167408726625, + "grad_norm": 0.47945451736450195, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7374163269996643, + 
"num_tokens": 1488932346.0, + "step": 5178 + }, + { + "epoch": 1.8445235975066785, + "grad_norm": 0.4615709185600281, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7712505757808685, + "num_tokens": 1489244533.0, + "step": 5179 + }, + { + "epoch": 1.8448797862867319, + "grad_norm": 0.48808741569519043, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7484977394342422, + "num_tokens": 1489521678.0, + "step": 5180 + }, + { + "epoch": 1.8452359750667853, + "grad_norm": 0.4438052475452423, + "learning_rate": 1e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7758512645959854, + "num_tokens": 1489833863.0, + "step": 5181 + }, + { + "epoch": 1.8455921638468387, + "grad_norm": 0.45862624049186707, + "learning_rate": 1e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.752197653055191, + "num_tokens": 1490149817.0, + "step": 5182 + }, + { + "epoch": 1.8459483526268923, + "grad_norm": 0.46900737285614014, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7521743029356003, + "num_tokens": 1490449756.0, + "step": 5183 + }, + { + "epoch": 1.8463045414069457, + "grad_norm": 0.4443691670894623, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.759577676653862, + "num_tokens": 1490746807.0, + "step": 5184 + }, + { + "epoch": 1.846660730186999, + "grad_norm": 0.4542902112007141, + "learning_rate": 1e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.781671866774559, + "num_tokens": 1491013277.0, + "step": 5185 + }, + { + "epoch": 1.8470169189670527, + "grad_norm": 0.4695200026035309, + "learning_rate": 1e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.768376812338829, + "num_tokens": 1491304287.0, + "step": 5186 + }, + { + "epoch": 1.847373107747106, + "grad_norm": 0.45310625433921814, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.754600316286087, + "num_tokens": 1491595176.0, + "step": 5187 + }, + { + "epoch": 1.8477292965271594, + "grad_norm": 0.4922456443309784, + 
"learning_rate": 1e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7669303268194199, + "num_tokens": 1491842962.0, + "step": 5188 + }, + { + "epoch": 1.8480854853072128, + "grad_norm": 0.48252612352371216, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7684705853462219, + "num_tokens": 1492118796.0, + "step": 5189 + }, + { + "epoch": 1.8484416740872662, + "grad_norm": 0.4593087434768677, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7633878886699677, + "num_tokens": 1492413457.0, + "step": 5190 + }, + { + "epoch": 1.8487978628673196, + "grad_norm": 0.4584794044494629, + "learning_rate": 1e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7843388020992279, + "num_tokens": 1492701945.0, + "step": 5191 + }, + { + "epoch": 1.849154051647373, + "grad_norm": 0.4097956120967865, + "learning_rate": 1e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7805692404508591, + "num_tokens": 1493035643.0, + "step": 5192 + }, + { + "epoch": 1.8495102404274264, + "grad_norm": 0.46364179253578186, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7682445794343948, + "num_tokens": 1493336647.0, + "step": 5193 + }, + { + "epoch": 1.84986642920748, + "grad_norm": 0.4852527379989624, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7552558332681656, + "num_tokens": 1493600669.0, + "step": 5194 + }, + { + "epoch": 1.8502226179875334, + "grad_norm": 0.47891339659690857, + "learning_rate": 1e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7543832361698151, + "num_tokens": 1493875162.0, + "step": 5195 + }, + { + "epoch": 1.8505788067675868, + "grad_norm": 0.499838650226593, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7718590050935745, + "num_tokens": 1494150091.0, + "step": 5196 + }, + { + "epoch": 1.8509349955476404, + "grad_norm": 0.46898800134658813, + "learning_rate": 1e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.775054082274437, + "num_tokens": 1494481348.0, + 
"step": 5197 + }, + { + "epoch": 1.8512911843276938, + "grad_norm": 0.49681156873703003, + "learning_rate": 1e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.7826218008995056, + "num_tokens": 1494778272.0, + "step": 5198 + }, + { + "epoch": 1.8516473731077472, + "grad_norm": 0.43717506527900696, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7699942737817764, + "num_tokens": 1495083677.0, + "step": 5199 + }, + { + "epoch": 1.8520035618878006, + "grad_norm": 0.49536943435668945, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7680865824222565, + "num_tokens": 1495408275.0, + "step": 5200 + }, + { + "epoch": 1.852359750667854, + "grad_norm": 0.5334924459457397, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7526366263628006, + "num_tokens": 1495676064.0, + "step": 5201 + }, + { + "epoch": 1.8527159394479074, + "grad_norm": 0.48400306701660156, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7603537291288376, + "num_tokens": 1495935438.0, + "step": 5202 + }, + { + "epoch": 1.8530721282279607, + "grad_norm": 0.47248148918151855, + "learning_rate": 1e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7541520148515701, + "num_tokens": 1496215043.0, + "step": 5203 + }, + { + "epoch": 1.8534283170080141, + "grad_norm": 0.46254676580429077, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7528046667575836, + "num_tokens": 1496509303.0, + "step": 5204 + }, + { + "epoch": 1.8537845057880675, + "grad_norm": 0.5028370022773743, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7675038129091263, + "num_tokens": 1496792191.0, + "step": 5205 + }, + { + "epoch": 1.8541406945681211, + "grad_norm": 0.4530744254589081, + "learning_rate": 1e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.7830066829919815, + "num_tokens": 1497086282.0, + "step": 5206 + }, + { + "epoch": 1.8544968833481745, + "grad_norm": 0.4514278471469879, + "learning_rate": 1e-06, + 
"loss": 0.7317, + "mean_token_accuracy": 0.7662804573774338, + "num_tokens": 1497415060.0, + "step": 5207 + }, + { + "epoch": 1.854853072128228, + "grad_norm": 0.4635753035545349, + "learning_rate": 1e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7590915262699127, + "num_tokens": 1497730327.0, + "step": 5208 + }, + { + "epoch": 1.8552092609082815, + "grad_norm": 0.522206723690033, + "learning_rate": 1e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7536329329013824, + "num_tokens": 1497990489.0, + "step": 5209 + }, + { + "epoch": 1.855565449688335, + "grad_norm": 0.4875999689102173, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7540510892868042, + "num_tokens": 1498290498.0, + "step": 5210 + }, + { + "epoch": 1.8559216384683883, + "grad_norm": 0.5036157965660095, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7619275748729706, + "num_tokens": 1498589799.0, + "step": 5211 + }, + { + "epoch": 1.8562778272484417, + "grad_norm": 0.4948749244213104, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7699586600065231, + "num_tokens": 1498870679.0, + "step": 5212 + }, + { + "epoch": 1.856634016028495, + "grad_norm": 0.4796941876411438, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.750460296869278, + "num_tokens": 1499153996.0, + "step": 5213 + }, + { + "epoch": 1.8569902048085485, + "grad_norm": 0.45237812399864197, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7663641571998596, + "num_tokens": 1499444898.0, + "step": 5214 + }, + { + "epoch": 1.8573463935886019, + "grad_norm": 0.5075642466545105, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7594647854566574, + "num_tokens": 1499710557.0, + "step": 5215 + }, + { + "epoch": 1.8577025823686553, + "grad_norm": 0.49145370721817017, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7564000934362411, + "num_tokens": 1500016663.0, + "step": 5216 + }, + { + 
"epoch": 1.8580587711487087, + "grad_norm": 0.517703115940094, + "learning_rate": 1e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7627497762441635, + "num_tokens": 1500282065.0, + "step": 5217 + }, + { + "epoch": 1.8584149599287623, + "grad_norm": 0.4717361629009247, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7625328451395035, + "num_tokens": 1500559537.0, + "step": 5218 + }, + { + "epoch": 1.8587711487088157, + "grad_norm": 0.4382922053337097, + "learning_rate": 1e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7751183658838272, + "num_tokens": 1500883207.0, + "step": 5219 + }, + { + "epoch": 1.859127337488869, + "grad_norm": 0.42201653122901917, + "learning_rate": 1e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.783586397767067, + "num_tokens": 1501203451.0, + "step": 5220 + }, + { + "epoch": 1.8594835262689227, + "grad_norm": 0.459049254655838, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.765157625079155, + "num_tokens": 1501490151.0, + "step": 5221 + }, + { + "epoch": 1.859839715048976, + "grad_norm": 0.4781498908996582, + "learning_rate": 1e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7429080605506897, + "num_tokens": 1501780172.0, + "step": 5222 + }, + { + "epoch": 1.8601959038290294, + "grad_norm": 0.5439484119415283, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7525749057531357, + "num_tokens": 1502037184.0, + "step": 5223 + }, + { + "epoch": 1.8605520926090828, + "grad_norm": 0.4804847240447998, + "learning_rate": 1e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7756088674068451, + "num_tokens": 1502312647.0, + "step": 5224 + }, + { + "epoch": 1.8609082813891362, + "grad_norm": 0.45570600032806396, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7532394975423813, + "num_tokens": 1502593780.0, + "step": 5225 + }, + { + "epoch": 1.8612644701691896, + "grad_norm": 0.46862924098968506, + "learning_rate": 1e-06, + "loss": 0.6309, + 
"mean_token_accuracy": 0.7932935506105423, + "num_tokens": 1502871685.0, + "step": 5226 + }, + { + "epoch": 1.861620658949243, + "grad_norm": 0.4635288715362549, + "learning_rate": 1e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.7865374833345413, + "num_tokens": 1503146935.0, + "step": 5227 + }, + { + "epoch": 1.8619768477292964, + "grad_norm": 0.4512139856815338, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7720954120159149, + "num_tokens": 1503465018.0, + "step": 5228 + }, + { + "epoch": 1.86233303650935, + "grad_norm": 0.46622222661972046, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7615202367305756, + "num_tokens": 1503776686.0, + "step": 5229 + }, + { + "epoch": 1.8626892252894034, + "grad_norm": 0.48247966170310974, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7721586227416992, + "num_tokens": 1504030676.0, + "step": 5230 + }, + { + "epoch": 1.8630454140694568, + "grad_norm": 0.49813416600227356, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7703858762979507, + "num_tokens": 1504309358.0, + "step": 5231 + }, + { + "epoch": 1.8634016028495104, + "grad_norm": 0.4830406904220581, + "learning_rate": 1e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.752678245306015, + "num_tokens": 1504611768.0, + "step": 5232 + }, + { + "epoch": 1.8637577916295638, + "grad_norm": 0.4403323531150818, + "learning_rate": 1e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7722730338573456, + "num_tokens": 1504926393.0, + "step": 5233 + }, + { + "epoch": 1.8641139804096172, + "grad_norm": 0.534511923789978, + "learning_rate": 1e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7714181393384933, + "num_tokens": 1505186292.0, + "step": 5234 + }, + { + "epoch": 1.8644701691896706, + "grad_norm": 0.41107240319252014, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7663711607456207, + "num_tokens": 1505480670.0, + "step": 5235 + }, + { + "epoch": 
1.864826357969724, + "grad_norm": 0.5058571696281433, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7555152624845505, + "num_tokens": 1505776250.0, + "step": 5236 + }, + { + "epoch": 1.8651825467497773, + "grad_norm": 0.4940919280052185, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7521576583385468, + "num_tokens": 1506024654.0, + "step": 5237 + }, + { + "epoch": 1.8655387355298307, + "grad_norm": 0.5171546936035156, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7602429986000061, + "num_tokens": 1506316084.0, + "step": 5238 + }, + { + "epoch": 1.8658949243098841, + "grad_norm": 0.4597989320755005, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7748600095510483, + "num_tokens": 1506611165.0, + "step": 5239 + }, + { + "epoch": 1.8662511130899375, + "grad_norm": 0.4930374324321747, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7702705413103104, + "num_tokens": 1506892085.0, + "step": 5240 + }, + { + "epoch": 1.8666073018699911, + "grad_norm": 0.5084826350212097, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7575867027044296, + "num_tokens": 1507154774.0, + "step": 5241 + }, + { + "epoch": 1.8669634906500445, + "grad_norm": 0.47890704870224, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7567248791456223, + "num_tokens": 1507439831.0, + "step": 5242 + }, + { + "epoch": 1.867319679430098, + "grad_norm": 0.4485117793083191, + "learning_rate": 1e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7641533315181732, + "num_tokens": 1507759969.0, + "step": 5243 + }, + { + "epoch": 1.8676758682101515, + "grad_norm": 0.4584687054157257, + "learning_rate": 1e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7652546763420105, + "num_tokens": 1508054199.0, + "step": 5244 + }, + { + "epoch": 1.868032056990205, + "grad_norm": 0.4645577073097229, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 
0.7496057152748108, + "num_tokens": 1508312071.0, + "step": 5245 + }, + { + "epoch": 1.8683882457702583, + "grad_norm": 0.4651801884174347, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7674933969974518, + "num_tokens": 1508604119.0, + "step": 5246 + }, + { + "epoch": 1.8687444345503117, + "grad_norm": 0.43786826729774475, + "learning_rate": 1e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7731348425149918, + "num_tokens": 1508939004.0, + "step": 5247 + }, + { + "epoch": 1.869100623330365, + "grad_norm": 0.4906632900238037, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7729325741529465, + "num_tokens": 1509198333.0, + "step": 5248 + }, + { + "epoch": 1.8694568121104185, + "grad_norm": 0.5010480284690857, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7616977393627167, + "num_tokens": 1509458357.0, + "step": 5249 + }, + { + "epoch": 1.8698130008904719, + "grad_norm": 0.45400187373161316, + "learning_rate": 1e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7711715996265411, + "num_tokens": 1509733694.0, + "step": 5250 + }, + { + "epoch": 1.8701691896705253, + "grad_norm": 0.44571852684020996, + "learning_rate": 1e-06, + "loss": 0.6646, + "mean_token_accuracy": 0.7847467213869095, + "num_tokens": 1510019870.0, + "step": 5251 + }, + { + "epoch": 1.8705253784505786, + "grad_norm": 0.47275638580322266, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7637888044118881, + "num_tokens": 1510277352.0, + "step": 5252 + }, + { + "epoch": 1.8708815672306323, + "grad_norm": 0.440930038690567, + "learning_rate": 1e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.7838753908872604, + "num_tokens": 1510554249.0, + "step": 5253 + }, + { + "epoch": 1.8712377560106856, + "grad_norm": 0.4802071750164032, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7478732019662857, + "num_tokens": 1510821704.0, + "step": 5254 + }, + { + "epoch": 1.871593944790739, + 
"grad_norm": 0.47415947914123535, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7744350135326385, + "num_tokens": 1511096068.0, + "step": 5255 + }, + { + "epoch": 1.8719501335707927, + "grad_norm": 0.44562584161758423, + "learning_rate": 1e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7745707035064697, + "num_tokens": 1511407532.0, + "step": 5256 + }, + { + "epoch": 1.872306322350846, + "grad_norm": 0.4685130715370178, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7558012902736664, + "num_tokens": 1511685605.0, + "step": 5257 + }, + { + "epoch": 1.8726625111308994, + "grad_norm": 0.4350995123386383, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7424681931734085, + "num_tokens": 1511963020.0, + "step": 5258 + }, + { + "epoch": 1.8730186999109528, + "grad_norm": 0.5175138711929321, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7373224943876266, + "num_tokens": 1512231165.0, + "step": 5259 + }, + { + "epoch": 1.8733748886910062, + "grad_norm": 0.5037145018577576, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7651385366916656, + "num_tokens": 1512509924.0, + "step": 5260 + }, + { + "epoch": 1.8737310774710596, + "grad_norm": 0.43617039918899536, + "learning_rate": 1e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7652343958616257, + "num_tokens": 1512795004.0, + "step": 5261 + }, + { + "epoch": 1.874087266251113, + "grad_norm": 0.44461938738822937, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7582945674657822, + "num_tokens": 1513139061.0, + "step": 5262 + }, + { + "epoch": 1.8744434550311664, + "grad_norm": 0.5340354442596436, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7537637501955032, + "num_tokens": 1513381108.0, + "step": 5263 + }, + { + "epoch": 1.87479964381122, + "grad_norm": 0.46314114332199097, + "learning_rate": 1e-06, + "loss": 0.7683, + "mean_token_accuracy": 
0.7543764859437943, + "num_tokens": 1513652966.0, + "step": 5264 + }, + { + "epoch": 1.8751558325912734, + "grad_norm": 0.4313160181045532, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7656840234994888, + "num_tokens": 1513966978.0, + "step": 5265 + }, + { + "epoch": 1.8755120213713268, + "grad_norm": 0.43410760164260864, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7578787952661514, + "num_tokens": 1514273211.0, + "step": 5266 + }, + { + "epoch": 1.8758682101513804, + "grad_norm": 0.4667041599750519, + "learning_rate": 1e-06, + "loss": 0.6531, + "mean_token_accuracy": 0.7916566133499146, + "num_tokens": 1514568394.0, + "step": 5267 + }, + { + "epoch": 1.8762243989314338, + "grad_norm": 0.47097742557525635, + "learning_rate": 1e-06, + "loss": 0.6606, + "mean_token_accuracy": 0.7845284342765808, + "num_tokens": 1514875450.0, + "step": 5268 + }, + { + "epoch": 1.8765805877114872, + "grad_norm": 0.4270552396774292, + "learning_rate": 1e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.781698927283287, + "num_tokens": 1515201875.0, + "step": 5269 + }, + { + "epoch": 1.8769367764915406, + "grad_norm": 0.4831985533237457, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7610168308019638, + "num_tokens": 1515506853.0, + "step": 5270 + }, + { + "epoch": 1.877292965271594, + "grad_norm": 0.5008886456489563, + "learning_rate": 1e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7709974646568298, + "num_tokens": 1515781850.0, + "step": 5271 + }, + { + "epoch": 1.8776491540516473, + "grad_norm": 0.5031018853187561, + "learning_rate": 1e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.7868773341178894, + "num_tokens": 1516028201.0, + "step": 5272 + }, + { + "epoch": 1.8780053428317007, + "grad_norm": 0.4858200252056122, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7682588547468185, + "num_tokens": 1516288976.0, + "step": 5273 + }, + { + "epoch": 1.8783615316117541, + 
"grad_norm": 0.483310341835022, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7631869912147522, + "num_tokens": 1516548088.0, + "step": 5274 + }, + { + "epoch": 1.8787177203918075, + "grad_norm": 0.45695510506629944, + "learning_rate": 1e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7769143283367157, + "num_tokens": 1516874873.0, + "step": 5275 + }, + { + "epoch": 1.8790739091718611, + "grad_norm": 0.5023783445358276, + "learning_rate": 1e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7729906141757965, + "num_tokens": 1517141210.0, + "step": 5276 + }, + { + "epoch": 1.8794300979519145, + "grad_norm": 0.5357189774513245, + "learning_rate": 1e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.760612964630127, + "num_tokens": 1517376067.0, + "step": 5277 + }, + { + "epoch": 1.879786286731968, + "grad_norm": 0.44658195972442627, + "learning_rate": 1e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7619483917951584, + "num_tokens": 1517662363.0, + "step": 5278 + }, + { + "epoch": 1.8801424755120215, + "grad_norm": 0.4758857488632202, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7604558020830154, + "num_tokens": 1517955908.0, + "step": 5279 + }, + { + "epoch": 1.880498664292075, + "grad_norm": 0.4499090015888214, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7587960213422775, + "num_tokens": 1518288856.0, + "step": 5280 + }, + { + "epoch": 1.8808548530721283, + "grad_norm": 0.5204932689666748, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.745443806052208, + "num_tokens": 1518525956.0, + "step": 5281 + }, + { + "epoch": 1.8812110418521817, + "grad_norm": 0.5397195219993591, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7682880610227585, + "num_tokens": 1518791352.0, + "step": 5282 + }, + { + "epoch": 1.881567230632235, + "grad_norm": 0.4481283724308014, + "learning_rate": 1e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7616339176893234, + 
"num_tokens": 1519092945.0, + "step": 5283 + }, + { + "epoch": 1.8819234194122885, + "grad_norm": 0.4149532914161682, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7520259767770767, + "num_tokens": 1519382886.0, + "step": 5284 + }, + { + "epoch": 1.8822796081923419, + "grad_norm": 0.4460362493991852, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7635058164596558, + "num_tokens": 1519685116.0, + "step": 5285 + }, + { + "epoch": 1.8826357969723952, + "grad_norm": 0.49059873819351196, + "learning_rate": 1e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.7811765521764755, + "num_tokens": 1519994257.0, + "step": 5286 + }, + { + "epoch": 1.8829919857524486, + "grad_norm": 0.4878509044647217, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7568959891796112, + "num_tokens": 1520297868.0, + "step": 5287 + }, + { + "epoch": 1.8833481745325023, + "grad_norm": 0.44812124967575073, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7512286156415939, + "num_tokens": 1520590188.0, + "step": 5288 + }, + { + "epoch": 1.8837043633125556, + "grad_norm": 0.46125662326812744, + "learning_rate": 1e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7805059105157852, + "num_tokens": 1520862454.0, + "step": 5289 + }, + { + "epoch": 1.884060552092609, + "grad_norm": 0.4634579122066498, + "learning_rate": 1e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7803802937269211, + "num_tokens": 1521145947.0, + "step": 5290 + }, + { + "epoch": 1.8844167408726626, + "grad_norm": 0.5378016233444214, + "learning_rate": 1e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7736194282770157, + "num_tokens": 1521433274.0, + "step": 5291 + }, + { + "epoch": 1.884772929652716, + "grad_norm": 0.47791191935539246, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7572124004364014, + "num_tokens": 1521711742.0, + "step": 5292 + }, + { + "epoch": 1.8851291184327694, + "grad_norm": 
0.4633397161960602, + "learning_rate": 1e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7680168151855469, + "num_tokens": 1522030846.0, + "step": 5293 + }, + { + "epoch": 1.8854853072128228, + "grad_norm": 0.497982382774353, + "learning_rate": 1e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7621263861656189, + "num_tokens": 1522307953.0, + "step": 5294 + }, + { + "epoch": 1.8858414959928762, + "grad_norm": 0.47625139355659485, + "learning_rate": 1e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.7820575833320618, + "num_tokens": 1522587525.0, + "step": 5295 + }, + { + "epoch": 1.8861976847729296, + "grad_norm": 0.4550572633743286, + "learning_rate": 1e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.7793886959552765, + "num_tokens": 1522855840.0, + "step": 5296 + }, + { + "epoch": 1.886553873552983, + "grad_norm": 0.4842345714569092, + "learning_rate": 1e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7731746882200241, + "num_tokens": 1523155506.0, + "step": 5297 + }, + { + "epoch": 1.8869100623330364, + "grad_norm": 0.45081838965415955, + "learning_rate": 1e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7776083648204803, + "num_tokens": 1523432701.0, + "step": 5298 + }, + { + "epoch": 1.88726625111309, + "grad_norm": 0.3934464454650879, + "learning_rate": 1e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.784832239151001, + "num_tokens": 1523773433.0, + "step": 5299 + }, + { + "epoch": 1.8876224398931434, + "grad_norm": 0.4577193260192871, + "learning_rate": 1e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7761583924293518, + "num_tokens": 1524071665.0, + "step": 5300 + }, + { + "epoch": 1.8879786286731968, + "grad_norm": 0.48844099044799805, + "learning_rate": 1e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.7802203446626663, + "num_tokens": 1524365305.0, + "step": 5301 + }, + { + "epoch": 1.8883348174532504, + "grad_norm": 0.5420563220977783, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7673725783824921, + 
"num_tokens": 1524632517.0, + "step": 5302 + }, + { + "epoch": 1.8886910062333038, + "grad_norm": 0.5271345376968384, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7610113471746445, + "num_tokens": 1524899427.0, + "step": 5303 + }, + { + "epoch": 1.8890471950133572, + "grad_norm": 0.4849953353404999, + "learning_rate": 1e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7836999446153641, + "num_tokens": 1525195374.0, + "step": 5304 + }, + { + "epoch": 1.8894033837934106, + "grad_norm": 0.4975948631763458, + "learning_rate": 1e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7548431605100632, + "num_tokens": 1525463646.0, + "step": 5305 + }, + { + "epoch": 1.889759572573464, + "grad_norm": 0.4611956477165222, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7749247252941132, + "num_tokens": 1525771136.0, + "step": 5306 + }, + { + "epoch": 1.8901157613535173, + "grad_norm": 0.4703485667705536, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.770317554473877, + "num_tokens": 1526066679.0, + "step": 5307 + }, + { + "epoch": 1.8904719501335707, + "grad_norm": 0.4441595673561096, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7637810707092285, + "num_tokens": 1526357697.0, + "step": 5308 + }, + { + "epoch": 1.8908281389136241, + "grad_norm": 0.4434840977191925, + "learning_rate": 1e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.757598489522934, + "num_tokens": 1526630541.0, + "step": 5309 + }, + { + "epoch": 1.8911843276936775, + "grad_norm": 0.4681874215602875, + "learning_rate": 1e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.773588627576828, + "num_tokens": 1526908421.0, + "step": 5310 + }, + { + "epoch": 1.8915405164737311, + "grad_norm": 0.39785119891166687, + "learning_rate": 1e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7706582695245743, + "num_tokens": 1527224005.0, + "step": 5311 + }, + { + "epoch": 1.8918967052537845, + "grad_norm": 0.4388566315174103, + 
"learning_rate": 1e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7539176344871521, + "num_tokens": 1527511411.0, + "step": 5312 + }, + { + "epoch": 1.892252894033838, + "grad_norm": 0.4801919460296631, + "learning_rate": 1e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7801326960325241, + "num_tokens": 1527787970.0, + "step": 5313 + }, + { + "epoch": 1.8926090828138915, + "grad_norm": 0.48084577918052673, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7488308399915695, + "num_tokens": 1528067826.0, + "step": 5314 + }, + { + "epoch": 1.892965271593945, + "grad_norm": 0.46188706159591675, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7672928720712662, + "num_tokens": 1528368282.0, + "step": 5315 + }, + { + "epoch": 1.8933214603739983, + "grad_norm": 0.46024832129478455, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7643974423408508, + "num_tokens": 1528680582.0, + "step": 5316 + }, + { + "epoch": 1.8936776491540517, + "grad_norm": 0.4754093289375305, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7665392905473709, + "num_tokens": 1528976201.0, + "step": 5317 + }, + { + "epoch": 1.894033837934105, + "grad_norm": 0.4689496159553528, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7632807344198227, + "num_tokens": 1529267795.0, + "step": 5318 + }, + { + "epoch": 1.8943900267141585, + "grad_norm": 0.4239097237586975, + "learning_rate": 1e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7864946126937866, + "num_tokens": 1529564366.0, + "step": 5319 + }, + { + "epoch": 1.8947462154942118, + "grad_norm": 0.5461118221282959, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7533271759748459, + "num_tokens": 1529823493.0, + "step": 5320 + }, + { + "epoch": 1.8951024042742652, + "grad_norm": 0.45862942934036255, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7716815173625946, + "num_tokens": 1530112662.0, + 
"step": 5321 + }, + { + "epoch": 1.8954585930543186, + "grad_norm": 0.40521374344825745, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7692786604166031, + "num_tokens": 1530468984.0, + "step": 5322 + }, + { + "epoch": 1.8958147818343722, + "grad_norm": 0.45781224966049194, + "learning_rate": 1e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7781451195478439, + "num_tokens": 1530783880.0, + "step": 5323 + }, + { + "epoch": 1.8961709706144256, + "grad_norm": 0.48334014415740967, + "learning_rate": 1e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7790314853191376, + "num_tokens": 1531085872.0, + "step": 5324 + }, + { + "epoch": 1.896527159394479, + "grad_norm": 0.46913784742355347, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7555646896362305, + "num_tokens": 1531365754.0, + "step": 5325 + }, + { + "epoch": 1.8968833481745326, + "grad_norm": 0.46207791566848755, + "learning_rate": 1e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7625025659799576, + "num_tokens": 1531643807.0, + "step": 5326 + }, + { + "epoch": 1.897239536954586, + "grad_norm": 0.47642025351524353, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7638672143220901, + "num_tokens": 1531948811.0, + "step": 5327 + }, + { + "epoch": 1.8975957257346394, + "grad_norm": 0.4387269914150238, + "learning_rate": 1e-06, + "loss": 0.6309, + "mean_token_accuracy": 0.7943932563066483, + "num_tokens": 1532270570.0, + "step": 5328 + }, + { + "epoch": 1.8979519145146928, + "grad_norm": 0.4642462134361267, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7637057304382324, + "num_tokens": 1532549244.0, + "step": 5329 + }, + { + "epoch": 1.8983081032947462, + "grad_norm": 0.4929651618003845, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7690836936235428, + "num_tokens": 1532837130.0, + "step": 5330 + }, + { + "epoch": 1.8986642920747996, + "grad_norm": 0.4947149157524109, + "learning_rate": 1e-06, + 
"loss": 0.7466, + "mean_token_accuracy": 0.7604472190141678, + "num_tokens": 1533091563.0, + "step": 5331 + }, + { + "epoch": 1.899020480854853, + "grad_norm": 0.4851807951927185, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7555614709854126, + "num_tokens": 1533380581.0, + "step": 5332 + }, + { + "epoch": 1.8993766696349064, + "grad_norm": 0.49276503920555115, + "learning_rate": 1e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.772733747959137, + "num_tokens": 1533663317.0, + "step": 5333 + }, + { + "epoch": 1.89973285841496, + "grad_norm": 0.5002153515815735, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.75758296251297, + "num_tokens": 1533967066.0, + "step": 5334 + }, + { + "epoch": 1.9000890471950134, + "grad_norm": 0.45250433683395386, + "learning_rate": 1e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7755094170570374, + "num_tokens": 1534286326.0, + "step": 5335 + }, + { + "epoch": 1.9004452359750668, + "grad_norm": 0.48713719844818115, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.776308000087738, + "num_tokens": 1534563816.0, + "step": 5336 + }, + { + "epoch": 1.9008014247551204, + "grad_norm": 0.5278892517089844, + "learning_rate": 1e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7697538733482361, + "num_tokens": 1534833792.0, + "step": 5337 + }, + { + "epoch": 1.9011576135351738, + "grad_norm": 0.44916483759880066, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.766237199306488, + "num_tokens": 1535136094.0, + "step": 5338 + }, + { + "epoch": 1.9015138023152272, + "grad_norm": 0.4732840061187744, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7660640925168991, + "num_tokens": 1535404315.0, + "step": 5339 + }, + { + "epoch": 1.9018699910952805, + "grad_norm": 0.5067861080169678, + "learning_rate": 1e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7717893123626709, + "num_tokens": 1535657116.0, + "step": 5340 + }, + { + 
"epoch": 1.902226179875334, + "grad_norm": 0.46353140473365784, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7573194056749344, + "num_tokens": 1535955049.0, + "step": 5341 + }, + { + "epoch": 1.9025823686553873, + "grad_norm": 0.4952629506587982, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7442856729030609, + "num_tokens": 1536211719.0, + "step": 5342 + }, + { + "epoch": 1.9029385574354407, + "grad_norm": 0.46782422065734863, + "learning_rate": 1e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7806822210550308, + "num_tokens": 1536519122.0, + "step": 5343 + }, + { + "epoch": 1.903294746215494, + "grad_norm": 0.4638440012931824, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7516587674617767, + "num_tokens": 1536796223.0, + "step": 5344 + }, + { + "epoch": 1.9036509349955475, + "grad_norm": 0.4453972578048706, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7556599825620651, + "num_tokens": 1537067695.0, + "step": 5345 + }, + { + "epoch": 1.904007123775601, + "grad_norm": 0.4625958204269409, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7655849456787109, + "num_tokens": 1537346250.0, + "step": 5346 + }, + { + "epoch": 1.9043633125556545, + "grad_norm": 0.46299174427986145, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7625988125801086, + "num_tokens": 1537608940.0, + "step": 5347 + }, + { + "epoch": 1.9047195013357079, + "grad_norm": 0.4231174886226654, + "learning_rate": 1e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.7872016727924347, + "num_tokens": 1537907054.0, + "step": 5348 + }, + { + "epoch": 1.9050756901157615, + "grad_norm": 0.48046761751174927, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7597083151340485, + "num_tokens": 1538207471.0, + "step": 5349 + }, + { + "epoch": 1.905431878895815, + "grad_norm": 0.45360133051872253, + "learning_rate": 1e-06, + "loss": 0.6979, + 
"mean_token_accuracy": 0.7764212787151337, + "num_tokens": 1538519753.0, + "step": 5350 + }, + { + "epoch": 1.9057880676758683, + "grad_norm": 0.4849177300930023, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7640206664800644, + "num_tokens": 1538801788.0, + "step": 5351 + }, + { + "epoch": 1.9061442564559217, + "grad_norm": 0.4887656569480896, + "learning_rate": 1e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7579140812158585, + "num_tokens": 1539088648.0, + "step": 5352 + }, + { + "epoch": 1.906500445235975, + "grad_norm": 0.43578967452049255, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7674206048250198, + "num_tokens": 1539418478.0, + "step": 5353 + }, + { + "epoch": 1.9068566340160285, + "grad_norm": 0.47997385263442993, + "learning_rate": 1e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7645538598299026, + "num_tokens": 1539666334.0, + "step": 5354 + }, + { + "epoch": 1.9072128227960818, + "grad_norm": 0.4651713967323303, + "learning_rate": 1e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7865704447031021, + "num_tokens": 1539956433.0, + "step": 5355 + }, + { + "epoch": 1.9075690115761352, + "grad_norm": 0.4915142357349396, + "learning_rate": 1e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7711023092269897, + "num_tokens": 1540254141.0, + "step": 5356 + }, + { + "epoch": 1.9079252003561886, + "grad_norm": 0.4789584279060364, + "learning_rate": 1e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.7796117067337036, + "num_tokens": 1540560678.0, + "step": 5357 + }, + { + "epoch": 1.9082813891362422, + "grad_norm": 0.4761678874492645, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.770614966750145, + "num_tokens": 1540834968.0, + "step": 5358 + }, + { + "epoch": 1.9086375779162956, + "grad_norm": 0.5105981826782227, + "learning_rate": 1e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7703551054000854, + "num_tokens": 1541120809.0, + "step": 5359 + }, + { + "epoch": 
1.908993766696349, + "grad_norm": 0.537405252456665, + "learning_rate": 1e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7696371823549271, + "num_tokens": 1541377155.0, + "step": 5360 + }, + { + "epoch": 1.9093499554764026, + "grad_norm": 0.47247016429901123, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7720305621623993, + "num_tokens": 1541641250.0, + "step": 5361 + }, + { + "epoch": 1.909706144256456, + "grad_norm": 0.45758941769599915, + "learning_rate": 1e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7731827795505524, + "num_tokens": 1541927880.0, + "step": 5362 + }, + { + "epoch": 1.9100623330365094, + "grad_norm": 0.41691258549690247, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7623930722475052, + "num_tokens": 1542259725.0, + "step": 5363 + }, + { + "epoch": 1.9104185218165628, + "grad_norm": 0.5000154376029968, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7745469808578491, + "num_tokens": 1542532433.0, + "step": 5364 + }, + { + "epoch": 1.9107747105966162, + "grad_norm": 0.4233979284763336, + "learning_rate": 1e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7748963832855225, + "num_tokens": 1542817419.0, + "step": 5365 + }, + { + "epoch": 1.9111308993766696, + "grad_norm": 0.48761776089668274, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7588950097560883, + "num_tokens": 1543099484.0, + "step": 5366 + }, + { + "epoch": 1.911487088156723, + "grad_norm": 0.47282180190086365, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7673131376504898, + "num_tokens": 1543382283.0, + "step": 5367 + }, + { + "epoch": 1.9118432769367764, + "grad_norm": 0.49747148156166077, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.764533594250679, + "num_tokens": 1543635655.0, + "step": 5368 + }, + { + "epoch": 1.91219946571683, + "grad_norm": 0.4834257960319519, + "learning_rate": 1e-06, + "loss": 0.6276, + 
"mean_token_accuracy": 0.7884391248226166, + "num_tokens": 1543946361.0, + "step": 5369 + }, + { + "epoch": 1.9125556544968834, + "grad_norm": 0.5124927163124084, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7657999396324158, + "num_tokens": 1544209534.0, + "step": 5370 + }, + { + "epoch": 1.9129118432769368, + "grad_norm": 0.5260215401649475, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7560880184173584, + "num_tokens": 1544465713.0, + "step": 5371 + }, + { + "epoch": 1.9132680320569904, + "grad_norm": 0.504663348197937, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7584267556667328, + "num_tokens": 1544790528.0, + "step": 5372 + }, + { + "epoch": 1.9136242208370438, + "grad_norm": 0.4629478454589844, + "learning_rate": 1e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7679242640733719, + "num_tokens": 1545083595.0, + "step": 5373 + }, + { + "epoch": 1.9139804096170971, + "grad_norm": 0.5126828551292419, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7616516798734665, + "num_tokens": 1545367826.0, + "step": 5374 + }, + { + "epoch": 1.9143365983971505, + "grad_norm": 0.46055108308792114, + "learning_rate": 1e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7703090608119965, + "num_tokens": 1545663951.0, + "step": 5375 + }, + { + "epoch": 1.914692787177204, + "grad_norm": 0.5079358220100403, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7627973705530167, + "num_tokens": 1545935741.0, + "step": 5376 + }, + { + "epoch": 1.9150489759572573, + "grad_norm": 0.4966123402118683, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7527821958065033, + "num_tokens": 1546190370.0, + "step": 5377 + }, + { + "epoch": 1.9154051647373107, + "grad_norm": 0.45555755496025085, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7696651220321655, + "num_tokens": 1546488881.0, + "step": 5378 + }, + { + "epoch": 
1.915761353517364, + "grad_norm": 0.4929571747779846, + "learning_rate": 1e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.7827885448932648, + "num_tokens": 1546792654.0, + "step": 5379 + }, + { + "epoch": 1.9161175422974175, + "grad_norm": 0.5398513674736023, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7599792629480362, + "num_tokens": 1547029571.0, + "step": 5380 + }, + { + "epoch": 1.916473731077471, + "grad_norm": 0.464351624250412, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7569766044616699, + "num_tokens": 1547322051.0, + "step": 5381 + }, + { + "epoch": 1.9168299198575245, + "grad_norm": 0.5113030076026917, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7735249251127243, + "num_tokens": 1547606495.0, + "step": 5382 + }, + { + "epoch": 1.9171861086375779, + "grad_norm": 0.48390862345695496, + "learning_rate": 1e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.7809348106384277, + "num_tokens": 1547923062.0, + "step": 5383 + }, + { + "epoch": 1.9175422974176315, + "grad_norm": 0.46969279646873474, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7711350917816162, + "num_tokens": 1548221821.0, + "step": 5384 + }, + { + "epoch": 1.9178984861976849, + "grad_norm": 0.46501752734184265, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7643966823816299, + "num_tokens": 1548514112.0, + "step": 5385 + }, + { + "epoch": 1.9182546749777383, + "grad_norm": 0.4904254972934723, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7552428990602493, + "num_tokens": 1548788951.0, + "step": 5386 + }, + { + "epoch": 1.9186108637577917, + "grad_norm": 0.49018633365631104, + "learning_rate": 1e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7509285807609558, + "num_tokens": 1549060887.0, + "step": 5387 + }, + { + "epoch": 1.918967052537845, + "grad_norm": 0.4979970455169678, + "learning_rate": 1e-06, + "loss": 0.7896, + 
"mean_token_accuracy": 0.7522920817136765, + "num_tokens": 1549349399.0, + "step": 5388 + }, + { + "epoch": 1.9193232413178984, + "grad_norm": 0.5426104068756104, + "learning_rate": 1e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7701967060565948, + "num_tokens": 1549592149.0, + "step": 5389 + }, + { + "epoch": 1.9196794300979518, + "grad_norm": 0.46147269010543823, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7716059684753418, + "num_tokens": 1549884960.0, + "step": 5390 + }, + { + "epoch": 1.9200356188780052, + "grad_norm": 0.4823215901851654, + "learning_rate": 1e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7657538205385208, + "num_tokens": 1550169128.0, + "step": 5391 + }, + { + "epoch": 1.9203918076580586, + "grad_norm": 0.4877515137195587, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7616133391857147, + "num_tokens": 1550439915.0, + "step": 5392 + }, + { + "epoch": 1.9207479964381122, + "grad_norm": 0.4666764736175537, + "learning_rate": 1e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7711893022060394, + "num_tokens": 1550763375.0, + "step": 5393 + }, + { + "epoch": 1.9211041852181656, + "grad_norm": 0.413842111825943, + "learning_rate": 1e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7719584703445435, + "num_tokens": 1551077176.0, + "step": 5394 + }, + { + "epoch": 1.921460373998219, + "grad_norm": 0.46932467818260193, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7534738779067993, + "num_tokens": 1551342051.0, + "step": 5395 + }, + { + "epoch": 1.9218165627782726, + "grad_norm": 0.4803679585456848, + "learning_rate": 1e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7824470698833466, + "num_tokens": 1551623989.0, + "step": 5396 + }, + { + "epoch": 1.922172751558326, + "grad_norm": 0.4370204508304596, + "learning_rate": 1e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7822945564985275, + "num_tokens": 1551919846.0, + "step": 5397 + }, + { + "epoch": 
1.9225289403383794, + "grad_norm": 0.4686790406703949, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7651993632316589, + "num_tokens": 1552198976.0, + "step": 5398 + }, + { + "epoch": 1.9228851291184328, + "grad_norm": 0.5013869404792786, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7579113841056824, + "num_tokens": 1552451592.0, + "step": 5399 + }, + { + "epoch": 1.9232413178984862, + "grad_norm": 0.47958189249038696, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7531742006540298, + "num_tokens": 1552780546.0, + "step": 5400 + }, + { + "epoch": 1.9235975066785396, + "grad_norm": 0.452753484249115, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7679703682661057, + "num_tokens": 1553096984.0, + "step": 5401 + }, + { + "epoch": 1.923953695458593, + "grad_norm": 0.45833510160446167, + "learning_rate": 1e-06, + "loss": 0.652, + "mean_token_accuracy": 0.7845841199159622, + "num_tokens": 1553361360.0, + "step": 5402 + }, + { + "epoch": 1.9243098842386464, + "grad_norm": 0.407952219247818, + "learning_rate": 1e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7802934050559998, + "num_tokens": 1553671651.0, + "step": 5403 + }, + { + "epoch": 1.9246660730187, + "grad_norm": 0.4685513973236084, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7691300064325333, + "num_tokens": 1553928545.0, + "step": 5404 + }, + { + "epoch": 1.9250222617987534, + "grad_norm": 0.49309274554252625, + "learning_rate": 1e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7796395421028137, + "num_tokens": 1554195683.0, + "step": 5405 + }, + { + "epoch": 1.9253784505788067, + "grad_norm": 0.4956723749637604, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7520930022001266, + "num_tokens": 1554459951.0, + "step": 5406 + }, + { + "epoch": 1.9257346393588604, + "grad_norm": 0.47817325592041016, + "learning_rate": 1e-06, + "loss": 0.7054, + 
"mean_token_accuracy": 0.7753801047801971, + "num_tokens": 1554780989.0, + "step": 5407 + }, + { + "epoch": 1.9260908281389137, + "grad_norm": 0.47296860814094543, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7739138901233673, + "num_tokens": 1555067728.0, + "step": 5408 + }, + { + "epoch": 1.9264470169189671, + "grad_norm": 0.48278021812438965, + "learning_rate": 1e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7776900976896286, + "num_tokens": 1555333836.0, + "step": 5409 + }, + { + "epoch": 1.9268032056990205, + "grad_norm": 0.5019027590751648, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.760138988494873, + "num_tokens": 1555608637.0, + "step": 5410 + }, + { + "epoch": 1.927159394479074, + "grad_norm": 0.46667349338531494, + "learning_rate": 1e-06, + "loss": 0.662, + "mean_token_accuracy": 0.7838621437549591, + "num_tokens": 1555906657.0, + "step": 5411 + }, + { + "epoch": 1.9275155832591273, + "grad_norm": 0.4882151484489441, + "learning_rate": 1e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7693597972393036, + "num_tokens": 1556171130.0, + "step": 5412 + }, + { + "epoch": 1.9278717720391807, + "grad_norm": 0.46004053950309753, + "learning_rate": 1e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7619980573654175, + "num_tokens": 1556464286.0, + "step": 5413 + }, + { + "epoch": 1.928227960819234, + "grad_norm": 0.4588375985622406, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7605991661548615, + "num_tokens": 1556758569.0, + "step": 5414 + }, + { + "epoch": 1.9285841495992875, + "grad_norm": 0.4899253249168396, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7497306168079376, + "num_tokens": 1557048579.0, + "step": 5415 + }, + { + "epoch": 1.928940338379341, + "grad_norm": 0.46171119809150696, + "learning_rate": 1e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.7837056219577789, + "num_tokens": 1557318854.0, + "step": 5416 + }, + { + "epoch": 
1.9292965271593945, + "grad_norm": 0.45861297845840454, + "learning_rate": 1e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7726177722215652, + "num_tokens": 1557616428.0, + "step": 5417 + }, + { + "epoch": 1.9296527159394479, + "grad_norm": 0.5167324542999268, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7586971819400787, + "num_tokens": 1557889801.0, + "step": 5418 + }, + { + "epoch": 1.9300089047195015, + "grad_norm": 0.4429628252983093, + "learning_rate": 1e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.754515990614891, + "num_tokens": 1558188138.0, + "step": 5419 + }, + { + "epoch": 1.9303650934995549, + "grad_norm": 0.3948979377746582, + "learning_rate": 1e-06, + "loss": 0.6501, + "mean_token_accuracy": 0.7881987690925598, + "num_tokens": 1558478606.0, + "step": 5420 + }, + { + "epoch": 1.9307212822796083, + "grad_norm": 0.45866426825523376, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7648779600858688, + "num_tokens": 1558759045.0, + "step": 5421 + }, + { + "epoch": 1.9310774710596617, + "grad_norm": 0.5337868332862854, + "learning_rate": 1e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7550013661384583, + "num_tokens": 1559026659.0, + "step": 5422 + }, + { + "epoch": 1.931433659839715, + "grad_norm": 0.49174097180366516, + "learning_rate": 1e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7743126899003983, + "num_tokens": 1559292738.0, + "step": 5423 + }, + { + "epoch": 1.9317898486197684, + "grad_norm": 0.5129671692848206, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7560589760541916, + "num_tokens": 1559593380.0, + "step": 5424 + }, + { + "epoch": 1.9321460373998218, + "grad_norm": 0.45245417952537537, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7676024734973907, + "num_tokens": 1559896776.0, + "step": 5425 + }, + { + "epoch": 1.9325022261798752, + "grad_norm": 0.45124056935310364, + "learning_rate": 1e-06, + "loss": 0.6695, + 
"mean_token_accuracy": 0.7711940556764603, + "num_tokens": 1560160513.0, + "step": 5426 + }, + { + "epoch": 1.9328584149599286, + "grad_norm": 0.5085384845733643, + "learning_rate": 1e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7839004993438721, + "num_tokens": 1560436329.0, + "step": 5427 + }, + { + "epoch": 1.9332146037399822, + "grad_norm": 0.5149649381637573, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7700431942939758, + "num_tokens": 1560716746.0, + "step": 5428 + }, + { + "epoch": 1.9335707925200356, + "grad_norm": 0.4282022714614868, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.768339604139328, + "num_tokens": 1560996980.0, + "step": 5429 + }, + { + "epoch": 1.933926981300089, + "grad_norm": 0.49611616134643555, + "learning_rate": 1e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7774557322263718, + "num_tokens": 1561255597.0, + "step": 5430 + }, + { + "epoch": 1.9342831700801426, + "grad_norm": 0.46640855073928833, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7712514251470566, + "num_tokens": 1561532642.0, + "step": 5431 + }, + { + "epoch": 1.934639358860196, + "grad_norm": 0.4837557077407837, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7726742327213287, + "num_tokens": 1561789132.0, + "step": 5432 + }, + { + "epoch": 1.9349955476402494, + "grad_norm": 0.49820393323898315, + "learning_rate": 1e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7751134186983109, + "num_tokens": 1562059097.0, + "step": 5433 + }, + { + "epoch": 1.9353517364203028, + "grad_norm": 0.424930602312088, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7817541658878326, + "num_tokens": 1562370013.0, + "step": 5434 + }, + { + "epoch": 1.9357079252003562, + "grad_norm": 0.4839213490486145, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7686876058578491, + "num_tokens": 1562657679.0, + "step": 5435 + }, + { + "epoch": 
1.9360641139804096, + "grad_norm": 0.49510061740875244, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7502620220184326, + "num_tokens": 1562975323.0, + "step": 5436 + }, + { + "epoch": 1.936420302760463, + "grad_norm": 0.4455815851688385, + "learning_rate": 1e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7722478806972504, + "num_tokens": 1563292146.0, + "step": 5437 + }, + { + "epoch": 1.9367764915405163, + "grad_norm": 0.47671839594841003, + "learning_rate": 1e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7746810764074326, + "num_tokens": 1563570596.0, + "step": 5438 + }, + { + "epoch": 1.93713268032057, + "grad_norm": 0.5501291751861572, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7598082572221756, + "num_tokens": 1563813089.0, + "step": 5439 + }, + { + "epoch": 1.9374888691006233, + "grad_norm": 0.48446786403656006, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.772756889462471, + "num_tokens": 1564106538.0, + "step": 5440 + }, + { + "epoch": 1.9378450578806767, + "grad_norm": 0.4295065402984619, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7560514658689499, + "num_tokens": 1564412375.0, + "step": 5441 + }, + { + "epoch": 1.9382012466607303, + "grad_norm": 0.4679573178291321, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.761592909693718, + "num_tokens": 1564694717.0, + "step": 5442 + }, + { + "epoch": 1.9385574354407837, + "grad_norm": 0.4633748233318329, + "learning_rate": 1e-06, + "loss": 0.697, + "mean_token_accuracy": 0.77280592918396, + "num_tokens": 1564972351.0, + "step": 5443 + }, + { + "epoch": 1.9389136242208371, + "grad_norm": 0.4673953950405121, + "learning_rate": 1e-06, + "loss": 0.697, + "mean_token_accuracy": 0.7780188471078873, + "num_tokens": 1565262422.0, + "step": 5444 + }, + { + "epoch": 1.9392698130008905, + "grad_norm": 0.4436965584754944, + "learning_rate": 1e-06, + "loss": 0.7228, + "mean_token_accuracy": 
0.7659854888916016, + "num_tokens": 1565553927.0, + "step": 5445 + }, + { + "epoch": 1.939626001780944, + "grad_norm": 0.5061420202255249, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.757102757692337, + "num_tokens": 1565800845.0, + "step": 5446 + }, + { + "epoch": 1.9399821905609973, + "grad_norm": 0.4356077015399933, + "learning_rate": 1e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7679402828216553, + "num_tokens": 1566094160.0, + "step": 5447 + }, + { + "epoch": 1.9403383793410507, + "grad_norm": 0.4713483452796936, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7584511935710907, + "num_tokens": 1566390648.0, + "step": 5448 + }, + { + "epoch": 1.940694568121104, + "grad_norm": 0.43408870697021484, + "learning_rate": 1e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.778349906206131, + "num_tokens": 1566686438.0, + "step": 5449 + }, + { + "epoch": 1.9410507569011575, + "grad_norm": 0.46410077810287476, + "learning_rate": 1e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7504910230636597, + "num_tokens": 1566975444.0, + "step": 5450 + }, + { + "epoch": 1.941406945681211, + "grad_norm": 0.46923309564590454, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7604482173919678, + "num_tokens": 1567260450.0, + "step": 5451 + }, + { + "epoch": 1.9417631344612645, + "grad_norm": 0.4452842175960541, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.774522989988327, + "num_tokens": 1567574944.0, + "step": 5452 + }, + { + "epoch": 1.9421193232413179, + "grad_norm": 0.48274388909339905, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7653632909059525, + "num_tokens": 1567842164.0, + "step": 5453 + }, + { + "epoch": 1.9424755120213715, + "grad_norm": 0.5109487771987915, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.744159922003746, + "num_tokens": 1568075747.0, + "step": 5454 + }, + { + "epoch": 1.9428317008014249, + "grad_norm": 
0.4571821093559265, + "learning_rate": 1e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7634616792201996, + "num_tokens": 1568382835.0, + "step": 5455 + }, + { + "epoch": 1.9431878895814783, + "grad_norm": 0.5138328671455383, + "learning_rate": 1e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7748526781797409, + "num_tokens": 1568647148.0, + "step": 5456 + }, + { + "epoch": 1.9435440783615316, + "grad_norm": 0.4906603991985321, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7648586481809616, + "num_tokens": 1568925356.0, + "step": 5457 + }, + { + "epoch": 1.943900267141585, + "grad_norm": 0.45284321904182434, + "learning_rate": 1e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7859360426664352, + "num_tokens": 1569244502.0, + "step": 5458 + }, + { + "epoch": 1.9442564559216384, + "grad_norm": 0.45200473070144653, + "learning_rate": 1e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7669053226709366, + "num_tokens": 1569557007.0, + "step": 5459 + }, + { + "epoch": 1.9446126447016918, + "grad_norm": 0.4075872302055359, + "learning_rate": 1e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.779547706246376, + "num_tokens": 1569865591.0, + "step": 5460 + }, + { + "epoch": 1.9449688334817452, + "grad_norm": 0.48158055543899536, + "learning_rate": 1e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.772932842373848, + "num_tokens": 1570148370.0, + "step": 5461 + }, + { + "epoch": 1.9453250222617986, + "grad_norm": 0.4527236521244049, + "learning_rate": 1e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7695218026638031, + "num_tokens": 1570419966.0, + "step": 5462 + }, + { + "epoch": 1.9456812110418522, + "grad_norm": 0.4389684796333313, + "learning_rate": 1e-06, + "loss": 0.6397, + "mean_token_accuracy": 0.7910161167383194, + "num_tokens": 1570698727.0, + "step": 5463 + }, + { + "epoch": 1.9460373998219056, + "grad_norm": 0.518245279788971, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7615911364555359, + 
"num_tokens": 1570964150.0, + "step": 5464 + }, + { + "epoch": 1.946393588601959, + "grad_norm": 0.4557335674762726, + "learning_rate": 1e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7845330238342285, + "num_tokens": 1571249588.0, + "step": 5465 + }, + { + "epoch": 1.9467497773820126, + "grad_norm": 0.489254355430603, + "learning_rate": 1e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7712555229663849, + "num_tokens": 1571520625.0, + "step": 5466 + }, + { + "epoch": 1.947105966162066, + "grad_norm": 0.46510401368141174, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7573680728673935, + "num_tokens": 1571827830.0, + "step": 5467 + }, + { + "epoch": 1.9474621549421194, + "grad_norm": 0.45779022574424744, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.768247440457344, + "num_tokens": 1572121357.0, + "step": 5468 + }, + { + "epoch": 1.9478183437221728, + "grad_norm": 0.5213465094566345, + "learning_rate": 1e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.7583455741405487, + "num_tokens": 1572388659.0, + "step": 5469 + }, + { + "epoch": 1.9481745325022262, + "grad_norm": 0.5034201145172119, + "learning_rate": 1e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.7759125530719757, + "num_tokens": 1572666617.0, + "step": 5470 + }, + { + "epoch": 1.9485307212822796, + "grad_norm": 0.4317682981491089, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7724210321903229, + "num_tokens": 1572955815.0, + "step": 5471 + }, + { + "epoch": 1.948886910062333, + "grad_norm": 0.47468671202659607, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7578070908784866, + "num_tokens": 1573227744.0, + "step": 5472 + }, + { + "epoch": 1.9492430988423863, + "grad_norm": 0.442536324262619, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7503780275583267, + "num_tokens": 1573530407.0, + "step": 5473 + }, + { + "epoch": 1.94959928762244, + "grad_norm": 0.4657142758369446, + 
"learning_rate": 1e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7747666090726852, + "num_tokens": 1573829355.0, + "step": 5474 + }, + { + "epoch": 1.9499554764024933, + "grad_norm": 0.4727543294429779, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7496336996555328, + "num_tokens": 1574096859.0, + "step": 5475 + }, + { + "epoch": 1.9503116651825467, + "grad_norm": 0.48608893156051636, + "learning_rate": 1e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7714705020189285, + "num_tokens": 1574365532.0, + "step": 5476 + }, + { + "epoch": 1.9506678539626003, + "grad_norm": 0.4579472839832306, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.775904506444931, + "num_tokens": 1574655735.0, + "step": 5477 + }, + { + "epoch": 1.9510240427426537, + "grad_norm": 0.45786967873573303, + "learning_rate": 1e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.762546494603157, + "num_tokens": 1574925331.0, + "step": 5478 + }, + { + "epoch": 1.9513802315227071, + "grad_norm": 0.5218304395675659, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7649288028478622, + "num_tokens": 1575198514.0, + "step": 5479 + }, + { + "epoch": 1.9517364203027605, + "grad_norm": 0.4833069443702698, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7705733180046082, + "num_tokens": 1575470607.0, + "step": 5480 + }, + { + "epoch": 1.952092609082814, + "grad_norm": 0.40711143612861633, + "learning_rate": 1e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7735403478145599, + "num_tokens": 1575773929.0, + "step": 5481 + }, + { + "epoch": 1.9524487978628673, + "grad_norm": 0.4778437912464142, + "learning_rate": 1e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7766203284263611, + "num_tokens": 1576049329.0, + "step": 5482 + }, + { + "epoch": 1.9528049866429207, + "grad_norm": 0.4895566701889038, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7535852938890457, + "num_tokens": 1576339978.0, + 
"step": 5483 + }, + { + "epoch": 1.953161175422974, + "grad_norm": 0.45196256041526794, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.771710067987442, + "num_tokens": 1576627602.0, + "step": 5484 + }, + { + "epoch": 1.9535173642030275, + "grad_norm": 0.5300318002700806, + "learning_rate": 1e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7592290639877319, + "num_tokens": 1576865034.0, + "step": 5485 + }, + { + "epoch": 1.953873552983081, + "grad_norm": 0.49144893884658813, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7568999379873276, + "num_tokens": 1577160984.0, + "step": 5486 + }, + { + "epoch": 1.9542297417631345, + "grad_norm": 0.5018950700759888, + "learning_rate": 1e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.7808286994695663, + "num_tokens": 1577447187.0, + "step": 5487 + }, + { + "epoch": 1.9545859305431879, + "grad_norm": 0.4498681128025055, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7461265623569489, + "num_tokens": 1577749412.0, + "step": 5488 + }, + { + "epoch": 1.9549421193232415, + "grad_norm": 0.43986061215400696, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7671977877616882, + "num_tokens": 1578046599.0, + "step": 5489 + }, + { + "epoch": 1.9552983081032949, + "grad_norm": 0.4795823097229004, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7609041184186935, + "num_tokens": 1578340251.0, + "step": 5490 + }, + { + "epoch": 1.9556544968833482, + "grad_norm": 0.509801983833313, + "learning_rate": 1e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.7846033573150635, + "num_tokens": 1578629918.0, + "step": 5491 + }, + { + "epoch": 1.9560106856634016, + "grad_norm": 0.5058856010437012, + "learning_rate": 1e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7731012850999832, + "num_tokens": 1578905217.0, + "step": 5492 + }, + { + "epoch": 1.956366874443455, + "grad_norm": 0.4754151403903961, + "learning_rate": 1e-06, + 
"loss": 0.6995, + "mean_token_accuracy": 0.7727643996477127, + "num_tokens": 1579196199.0, + "step": 5493 + }, + { + "epoch": 1.9567230632235084, + "grad_norm": 0.4103291630744934, + "learning_rate": 1e-06, + "loss": 0.784, + "mean_token_accuracy": 0.7536162286996841, + "num_tokens": 1579515114.0, + "step": 5494 + }, + { + "epoch": 1.9570792520035618, + "grad_norm": 0.45001256465911865, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7610194236040115, + "num_tokens": 1579803542.0, + "step": 5495 + }, + { + "epoch": 1.9574354407836152, + "grad_norm": 0.4577392041683197, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7689777910709381, + "num_tokens": 1580080770.0, + "step": 5496 + }, + { + "epoch": 1.9577916295636686, + "grad_norm": 0.4863710403442383, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7679994255304337, + "num_tokens": 1580355648.0, + "step": 5497 + }, + { + "epoch": 1.9581478183437222, + "grad_norm": 0.44373616576194763, + "learning_rate": 1e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7744762301445007, + "num_tokens": 1580678405.0, + "step": 5498 + }, + { + "epoch": 1.9585040071237756, + "grad_norm": 0.48272672295570374, + "learning_rate": 1e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7766113877296448, + "num_tokens": 1580969222.0, + "step": 5499 + }, + { + "epoch": 1.958860195903829, + "grad_norm": 0.4851745367050171, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7506410479545593, + "num_tokens": 1581225860.0, + "step": 5500 + }, + { + "epoch": 1.9592163846838826, + "grad_norm": 0.46900510787963867, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7643536776304245, + "num_tokens": 1581513590.0, + "step": 5501 + }, + { + "epoch": 1.959572573463936, + "grad_norm": 0.43044617772102356, + "learning_rate": 1e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7615376263856888, + "num_tokens": 1581813979.0, + "step": 5502 + }, + { + 
"epoch": 1.9599287622439894, + "grad_norm": 0.4564194083213806, + "learning_rate": 1e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7666552066802979, + "num_tokens": 1582134148.0, + "step": 5503 + }, + { + "epoch": 1.9602849510240428, + "grad_norm": 0.48367297649383545, + "learning_rate": 1e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7694279551506042, + "num_tokens": 1582449302.0, + "step": 5504 + }, + { + "epoch": 1.9606411398040962, + "grad_norm": 0.46194592118263245, + "learning_rate": 1e-06, + "loss": 0.695, + "mean_token_accuracy": 0.773886039853096, + "num_tokens": 1582765110.0, + "step": 5505 + }, + { + "epoch": 1.9609973285841495, + "grad_norm": 0.4698546230792999, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7594848871231079, + "num_tokens": 1583073972.0, + "step": 5506 + }, + { + "epoch": 1.961353517364203, + "grad_norm": 0.4329126477241516, + "learning_rate": 1e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7856752723455429, + "num_tokens": 1583390857.0, + "step": 5507 + }, + { + "epoch": 1.9617097061442563, + "grad_norm": 0.5168356895446777, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7680701613426208, + "num_tokens": 1583682129.0, + "step": 5508 + }, + { + "epoch": 1.96206589492431, + "grad_norm": 0.4865666627883911, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7501659244298935, + "num_tokens": 1583946277.0, + "step": 5509 + }, + { + "epoch": 1.9624220837043633, + "grad_norm": 0.4823319911956787, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7610309273004532, + "num_tokens": 1584210199.0, + "step": 5510 + }, + { + "epoch": 1.9627782724844167, + "grad_norm": 0.4841097593307495, + "learning_rate": 1e-06, + "loss": 0.662, + "mean_token_accuracy": 0.7804813534021378, + "num_tokens": 1584475815.0, + "step": 5511 + }, + { + "epoch": 1.9631344612644703, + "grad_norm": 0.4472742974758148, + "learning_rate": 1e-06, + "loss": 0.7538, + 
"mean_token_accuracy": 0.7653645277023315, + "num_tokens": 1584774514.0, + "step": 5512 + }, + { + "epoch": 1.9634906500445237, + "grad_norm": 0.4876857399940491, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7695755213499069, + "num_tokens": 1585095781.0, + "step": 5513 + }, + { + "epoch": 1.9638468388245771, + "grad_norm": 0.47002848982810974, + "learning_rate": 1e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7761745452880859, + "num_tokens": 1585391497.0, + "step": 5514 + }, + { + "epoch": 1.9642030276046305, + "grad_norm": 0.5221803188323975, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7576541155576706, + "num_tokens": 1585674904.0, + "step": 5515 + }, + { + "epoch": 1.964559216384684, + "grad_norm": 0.4753228724002838, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7675047516822815, + "num_tokens": 1585943645.0, + "step": 5516 + }, + { + "epoch": 1.9649154051647373, + "grad_norm": 0.4705134332180023, + "learning_rate": 1e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7840235978364944, + "num_tokens": 1586236924.0, + "step": 5517 + }, + { + "epoch": 1.9652715939447907, + "grad_norm": 0.43835511803627014, + "learning_rate": 1e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7797610759735107, + "num_tokens": 1586555054.0, + "step": 5518 + }, + { + "epoch": 1.965627782724844, + "grad_norm": 0.45756110548973083, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7583010792732239, + "num_tokens": 1586818481.0, + "step": 5519 + }, + { + "epoch": 1.9659839715048975, + "grad_norm": 0.479839026927948, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.773421585559845, + "num_tokens": 1587099690.0, + "step": 5520 + }, + { + "epoch": 1.966340160284951, + "grad_norm": 0.4337911903858185, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7545686215162277, + "num_tokens": 1587433026.0, + "step": 5521 + }, + { + "epoch": 
1.9666963490650045, + "grad_norm": 0.4673177897930145, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7535378336906433, + "num_tokens": 1587694442.0, + "step": 5522 + }, + { + "epoch": 1.9670525378450578, + "grad_norm": 0.41565433144569397, + "learning_rate": 1e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7702723145484924, + "num_tokens": 1588034017.0, + "step": 5523 + }, + { + "epoch": 1.9674087266251115, + "grad_norm": 0.4931044280529022, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7764839977025986, + "num_tokens": 1588287235.0, + "step": 5524 + }, + { + "epoch": 1.9677649154051648, + "grad_norm": 0.43749797344207764, + "learning_rate": 1e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.781704843044281, + "num_tokens": 1588566891.0, + "step": 5525 + }, + { + "epoch": 1.9681211041852182, + "grad_norm": 0.5041404366493225, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7660014182329178, + "num_tokens": 1588815417.0, + "step": 5526 + }, + { + "epoch": 1.9684772929652716, + "grad_norm": 0.49604910612106323, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7674398720264435, + "num_tokens": 1589070749.0, + "step": 5527 + }, + { + "epoch": 1.968833481745325, + "grad_norm": 0.44380807876586914, + "learning_rate": 1e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7778946757316589, + "num_tokens": 1589353775.0, + "step": 5528 + }, + { + "epoch": 1.9691896705253784, + "grad_norm": 0.4708281457424164, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7573909014463425, + "num_tokens": 1589650652.0, + "step": 5529 + }, + { + "epoch": 1.9695458593054318, + "grad_norm": 0.44994357228279114, + "learning_rate": 1e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.752069354057312, + "num_tokens": 1589959788.0, + "step": 5530 + }, + { + "epoch": 1.9699020480854852, + "grad_norm": 0.4614703059196472, + "learning_rate": 1e-06, + "loss": 0.7131, + 
"mean_token_accuracy": 0.7780864834785461, + "num_tokens": 1590240843.0, + "step": 5531 + }, + { + "epoch": 1.9702582368655386, + "grad_norm": 0.4434351623058319, + "learning_rate": 1e-06, + "loss": 0.6763, + "mean_token_accuracy": 0.7858883589506149, + "num_tokens": 1590533796.0, + "step": 5532 + }, + { + "epoch": 1.9706144256455922, + "grad_norm": 0.44487351179122925, + "learning_rate": 1e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.773015171289444, + "num_tokens": 1590813420.0, + "step": 5533 + }, + { + "epoch": 1.9709706144256456, + "grad_norm": 0.4928366243839264, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7704989016056061, + "num_tokens": 1591119047.0, + "step": 5534 + }, + { + "epoch": 1.971326803205699, + "grad_norm": 0.4728802740573883, + "learning_rate": 1e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7663658857345581, + "num_tokens": 1591418574.0, + "step": 5535 + }, + { + "epoch": 1.9716829919857526, + "grad_norm": 0.43806397914886475, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7722010463476181, + "num_tokens": 1591724236.0, + "step": 5536 + }, + { + "epoch": 1.972039180765806, + "grad_norm": 0.50460284948349, + "learning_rate": 1e-06, + "loss": 0.6968, + "mean_token_accuracy": 0.7801555544137955, + "num_tokens": 1592008011.0, + "step": 5537 + }, + { + "epoch": 1.9723953695458594, + "grad_norm": 0.4174475073814392, + "learning_rate": 1e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7757394760847092, + "num_tokens": 1592310497.0, + "step": 5538 + }, + { + "epoch": 1.9727515583259128, + "grad_norm": 0.48087117075920105, + "learning_rate": 1e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7747387140989304, + "num_tokens": 1592570007.0, + "step": 5539 + }, + { + "epoch": 1.9731077471059661, + "grad_norm": 0.43940213322639465, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7633576691150665, + "num_tokens": 1592864195.0, + "step": 5540 + }, + { + "epoch": 
1.9734639358860195, + "grad_norm": 0.5075809955596924, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7646771967411041, + "num_tokens": 1593141252.0, + "step": 5541 + }, + { + "epoch": 1.973820124666073, + "grad_norm": 0.5161347985267639, + "learning_rate": 1e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7612256109714508, + "num_tokens": 1593441665.0, + "step": 5542 + }, + { + "epoch": 1.9741763134461263, + "grad_norm": 0.4379039704799652, + "learning_rate": 1e-06, + "loss": 0.6476, + "mean_token_accuracy": 0.7926329374313354, + "num_tokens": 1593746488.0, + "step": 5543 + }, + { + "epoch": 1.97453250222618, + "grad_norm": 0.4794495105743408, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7657440155744553, + "num_tokens": 1594036256.0, + "step": 5544 + }, + { + "epoch": 1.9748886910062333, + "grad_norm": 0.4532887637615204, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7622545212507248, + "num_tokens": 1594325314.0, + "step": 5545 + }, + { + "epoch": 1.9752448797862867, + "grad_norm": 0.42410650849342346, + "learning_rate": 1e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7668342292308807, + "num_tokens": 1594641367.0, + "step": 5546 + }, + { + "epoch": 1.9756010685663403, + "grad_norm": 0.4357300400733948, + "learning_rate": 1e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7788998484611511, + "num_tokens": 1594980394.0, + "step": 5547 + }, + { + "epoch": 1.9759572573463937, + "grad_norm": 0.518975019454956, + "learning_rate": 1e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.779037356376648, + "num_tokens": 1595253694.0, + "step": 5548 + }, + { + "epoch": 1.976313446126447, + "grad_norm": 0.47926267981529236, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7569493502378464, + "num_tokens": 1595516061.0, + "step": 5549 + }, + { + "epoch": 1.9766696349065005, + "grad_norm": 0.45855435729026794, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 
0.7645456790924072, + "num_tokens": 1595800832.0, + "step": 5550 + }, + { + "epoch": 1.9770258236865539, + "grad_norm": 0.4507872462272644, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7590217590332031, + "num_tokens": 1596078658.0, + "step": 5551 + }, + { + "epoch": 1.9773820124666073, + "grad_norm": 0.4465562701225281, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7730863094329834, + "num_tokens": 1596365395.0, + "step": 5552 + }, + { + "epoch": 1.9777382012466607, + "grad_norm": 0.4984028935432434, + "learning_rate": 1e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.775213822722435, + "num_tokens": 1596643494.0, + "step": 5553 + }, + { + "epoch": 1.978094390026714, + "grad_norm": 0.4780808091163635, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.769644170999527, + "num_tokens": 1596940484.0, + "step": 5554 + }, + { + "epoch": 1.9784505788067674, + "grad_norm": 0.43138518929481506, + "learning_rate": 1e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7576895952224731, + "num_tokens": 1597243059.0, + "step": 5555 + }, + { + "epoch": 1.978806767586821, + "grad_norm": 0.4804530143737793, + "learning_rate": 1e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7543904781341553, + "num_tokens": 1597558525.0, + "step": 5556 + }, + { + "epoch": 1.9791629563668744, + "grad_norm": 0.4414939284324646, + "learning_rate": 1e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7732480317354202, + "num_tokens": 1597848785.0, + "step": 5557 + }, + { + "epoch": 1.9795191451469278, + "grad_norm": 0.42033666372299194, + "learning_rate": 1e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.7864417284727097, + "num_tokens": 1598166222.0, + "step": 5558 + }, + { + "epoch": 1.9798753339269815, + "grad_norm": 0.44236883521080017, + "learning_rate": 1e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.7783053517341614, + "num_tokens": 1598487342.0, + "step": 5559 + }, + { + "epoch": 1.9802315227070348, + "grad_norm": 
0.48829036951065063, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.772054448723793, + "num_tokens": 1598763870.0, + "step": 5560 + }, + { + "epoch": 1.9805877114870882, + "grad_norm": 0.44400402903556824, + "learning_rate": 1e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.791001558303833, + "num_tokens": 1599071447.0, + "step": 5561 + }, + { + "epoch": 1.9809439002671416, + "grad_norm": 0.4481184482574463, + "learning_rate": 1e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.7879683822393417, + "num_tokens": 1599376219.0, + "step": 5562 + }, + { + "epoch": 1.981300089047195, + "grad_norm": 0.49011197686195374, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7454539835453033, + "num_tokens": 1599654829.0, + "step": 5563 + }, + { + "epoch": 1.9816562778272484, + "grad_norm": 0.520119845867157, + "learning_rate": 1e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7421290725469589, + "num_tokens": 1599909465.0, + "step": 5564 + }, + { + "epoch": 1.9820124666073018, + "grad_norm": 0.4789966940879822, + "learning_rate": 1e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7615353912115097, + "num_tokens": 1600221309.0, + "step": 5565 + }, + { + "epoch": 1.9823686553873552, + "grad_norm": 0.4660099148750305, + "learning_rate": 1e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.7851861268281937, + "num_tokens": 1600501398.0, + "step": 5566 + }, + { + "epoch": 1.9827248441674086, + "grad_norm": 0.49870938062667847, + "learning_rate": 1e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7769932895898819, + "num_tokens": 1600766727.0, + "step": 5567 + }, + { + "epoch": 1.9830810329474622, + "grad_norm": 0.4468436539173126, + "learning_rate": 1e-06, + "loss": 0.679, + "mean_token_accuracy": 0.781102865934372, + "num_tokens": 1601068830.0, + "step": 5568 + }, + { + "epoch": 1.9834372217275156, + "grad_norm": 0.4492303729057312, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7687644809484482, + 
"num_tokens": 1601388344.0, + "step": 5569 + }, + { + "epoch": 1.983793410507569, + "grad_norm": 0.49886834621429443, + "learning_rate": 1e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7662635296583176, + "num_tokens": 1601664267.0, + "step": 5570 + }, + { + "epoch": 1.9841495992876226, + "grad_norm": 0.48239049315452576, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7568729817867279, + "num_tokens": 1601974702.0, + "step": 5571 + }, + { + "epoch": 1.984505788067676, + "grad_norm": 0.4650186598300934, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7610148042440414, + "num_tokens": 1602257799.0, + "step": 5572 + }, + { + "epoch": 1.9848619768477294, + "grad_norm": 0.4354504644870758, + "learning_rate": 1e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7758881598711014, + "num_tokens": 1602562853.0, + "step": 5573 + }, + { + "epoch": 1.9852181656277827, + "grad_norm": 0.49696338176727295, + "learning_rate": 1e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.7828896194696426, + "num_tokens": 1602821132.0, + "step": 5574 + }, + { + "epoch": 1.9855743544078361, + "grad_norm": 0.46946635842323303, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7532879412174225, + "num_tokens": 1603106698.0, + "step": 5575 + }, + { + "epoch": 1.9859305431878895, + "grad_norm": 0.4816633462905884, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7559236586093903, + "num_tokens": 1603375028.0, + "step": 5576 + }, + { + "epoch": 1.986286731967943, + "grad_norm": 0.46369221806526184, + "learning_rate": 1e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.7613637298345566, + "num_tokens": 1603655665.0, + "step": 5577 + }, + { + "epoch": 1.9866429207479963, + "grad_norm": 0.45219066739082336, + "learning_rate": 1e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7797054499387741, + "num_tokens": 1603979723.0, + "step": 5578 + }, + { + "epoch": 1.9869991095280497, + "grad_norm": 
0.4945601522922516, + "learning_rate": 1e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7722572535276413, + "num_tokens": 1604251955.0, + "step": 5579 + }, + { + "epoch": 1.9873552983081033, + "grad_norm": 0.4658658504486084, + "learning_rate": 1e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7632236927747726, + "num_tokens": 1604544804.0, + "step": 5580 + }, + { + "epoch": 1.9877114870881567, + "grad_norm": 0.44212499260902405, + "learning_rate": 1e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7552203834056854, + "num_tokens": 1604849306.0, + "step": 5581 + }, + { + "epoch": 1.9880676758682103, + "grad_norm": 0.469224750995636, + "learning_rate": 1e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.7840899527072906, + "num_tokens": 1605176346.0, + "step": 5582 + }, + { + "epoch": 1.9884238646482637, + "grad_norm": 0.503125011920929, + "learning_rate": 1e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7626977860927582, + "num_tokens": 1605430285.0, + "step": 5583 + }, + { + "epoch": 1.988780053428317, + "grad_norm": 0.42485928535461426, + "learning_rate": 1e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.773956224322319, + "num_tokens": 1605713066.0, + "step": 5584 + }, + { + "epoch": 1.9891362422083705, + "grad_norm": 0.5066702365875244, + "learning_rate": 1e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7715466320514679, + "num_tokens": 1606002016.0, + "step": 5585 + }, + { + "epoch": 1.9894924309884239, + "grad_norm": 0.533656120300293, + "learning_rate": 1e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7760118842124939, + "num_tokens": 1606261816.0, + "step": 5586 + }, + { + "epoch": 1.9898486197684773, + "grad_norm": 0.49805116653442383, + "learning_rate": 1e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7682224214076996, + "num_tokens": 1606540561.0, + "step": 5587 + }, + { + "epoch": 1.9902048085485307, + "grad_norm": 0.5083380937576294, + "learning_rate": 1e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7561133205890656, + 
"num_tokens": 1606835725.0, + "step": 5588 + }, + { + "epoch": 1.990560997328584, + "grad_norm": 0.5190730690956116, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7604498118162155, + "num_tokens": 1607090315.0, + "step": 5589 + }, + { + "epoch": 1.9909171861086374, + "grad_norm": 0.4424189627170563, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7755170464515686, + "num_tokens": 1607378851.0, + "step": 5590 + }, + { + "epoch": 1.991273374888691, + "grad_norm": 0.4532107710838318, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7413247972726822, + "num_tokens": 1607648394.0, + "step": 5591 + }, + { + "epoch": 1.9916295636687444, + "grad_norm": 0.4780411124229431, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7579608708620071, + "num_tokens": 1607961446.0, + "step": 5592 + }, + { + "epoch": 1.9919857524487978, + "grad_norm": 0.45275235176086426, + "learning_rate": 1e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7805564105510712, + "num_tokens": 1608267104.0, + "step": 5593 + }, + { + "epoch": 1.9923419412288514, + "grad_norm": 0.4729406535625458, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7640907317399979, + "num_tokens": 1608573516.0, + "step": 5594 + }, + { + "epoch": 1.9926981300089048, + "grad_norm": 0.4850616157054901, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7602981477975845, + "num_tokens": 1608857011.0, + "step": 5595 + }, + { + "epoch": 1.9930543187889582, + "grad_norm": 0.5252984762191772, + "learning_rate": 1e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7742339074611664, + "num_tokens": 1609119432.0, + "step": 5596 + }, + { + "epoch": 1.9934105075690116, + "grad_norm": 0.4368908703327179, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7577608972787857, + "num_tokens": 1609399459.0, + "step": 5597 + }, + { + "epoch": 1.993766696349065, + "grad_norm": 0.4946627914905548, + 
"learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7549498677253723, + "num_tokens": 1609656165.0, + "step": 5598 + }, + { + "epoch": 1.9941228851291184, + "grad_norm": 0.45117634534835815, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7656417042016983, + "num_tokens": 1609967171.0, + "step": 5599 + }, + { + "epoch": 1.9944790739091718, + "grad_norm": 0.43938854336738586, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7596473395824432, + "num_tokens": 1610291934.0, + "step": 5600 + }, + { + "epoch": 1.9948352626892252, + "grad_norm": 0.4738251864910126, + "learning_rate": 1e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7539061009883881, + "num_tokens": 1610561310.0, + "step": 5601 + }, + { + "epoch": 1.9951914514692786, + "grad_norm": 0.46963322162628174, + "learning_rate": 1e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7821420729160309, + "num_tokens": 1610832021.0, + "step": 5602 + }, + { + "epoch": 1.9955476402493322, + "grad_norm": 0.4186444878578186, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7566088140010834, + "num_tokens": 1611124559.0, + "step": 5603 + }, + { + "epoch": 1.9959038290293856, + "grad_norm": 0.47711265087127686, + "learning_rate": 1e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7608002722263336, + "num_tokens": 1611427714.0, + "step": 5604 + }, + { + "epoch": 1.996260017809439, + "grad_norm": 0.5110669732093811, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7604494094848633, + "num_tokens": 1611701153.0, + "step": 5605 + }, + { + "epoch": 1.9966162065894926, + "grad_norm": 0.47857171297073364, + "learning_rate": 1e-06, + "loss": 0.6396, + "mean_token_accuracy": 0.7910080999135971, + "num_tokens": 1612030616.0, + "step": 5606 + }, + { + "epoch": 1.996972395369546, + "grad_norm": 0.49254778027534485, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7602660357952118, + "num_tokens": 
1612276333.0, + "step": 5607 + }, + { + "epoch": 1.9973285841495994, + "grad_norm": 0.4304145276546478, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7696533650159836, + "num_tokens": 1612586815.0, + "step": 5608 + }, + { + "epoch": 1.9976847729296527, + "grad_norm": 0.4935663342475891, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7735784202814102, + "num_tokens": 1612898560.0, + "step": 5609 + }, + { + "epoch": 1.9980409617097061, + "grad_norm": 0.4570833444595337, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7576316893100739, + "num_tokens": 1613202690.0, + "step": 5610 + }, + { + "epoch": 1.9983971504897595, + "grad_norm": 0.4609915316104889, + "learning_rate": 1e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.7897260785102844, + "num_tokens": 1613484579.0, + "step": 5611 + }, + { + "epoch": 1.998753339269813, + "grad_norm": 0.4840622544288635, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7634688168764114, + "num_tokens": 1613777433.0, + "step": 5612 + }, + { + "epoch": 1.9991095280498663, + "grad_norm": 0.5218567252159119, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7600423842668533, + "num_tokens": 1614043947.0, + "step": 5613 + }, + { + "epoch": 1.9994657168299197, + "grad_norm": 0.44711068272590637, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7672130912542343, + "num_tokens": 1614335115.0, + "step": 5614 + }, + { + "epoch": 1.9998219056099733, + "grad_norm": 0.45583704113960266, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7575484216213226, + "num_tokens": 1614665310.0, + "step": 5615 + }, + { + "epoch": 2.0, + "grad_norm": 1.0066611766815186, + "learning_rate": 1e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7591507434844971, + "num_tokens": 1614785793.0, + "step": 5616 + }, + { + "epoch": 2.0, + "step": 5616, + "total_flos": 7.271307435875238e+19, + "train_loss": 
0.7722587241576269, + "train_runtime": 61686.092, + "train_samples_per_second": 8.738, + "train_steps_per_second": 0.091 + } + ], + "logging_steps": 1, + "max_steps": 5616, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 562, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.271307435875238e+19, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..bb3f49d --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ff6b7fcb01a68f786b5c480c80fff477d80c4d348fd41d35f6776225e113eb2 +size 13265