From 581c7a2eb99d5349a90b7690e1da7e8d566330b4 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 12 Jun 2026 02:43:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_MoTv00.03 Source: Original Platform --- .gitattributes | 36 + README.md | 59 + all_results.json | 8 + chat_template.jinja | 121 + config.json | 35 + generation_config.json | 8 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 10 + tokenizer.json | 3 + tokenizer_config.json | 2062 ++ train_results.json | 8 + trainer_state.json | 45538 +++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 48202 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs 
merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..6583337 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/MoT_all_Llama3_8192toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_MoTv00.03 +tags: +- generated_from_trainer +- sft +- open-r1 +- trl +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_MoTv00.03 + +This model is a fine-tuned version of 
[meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/MoT_all_Llama3_8192toks](https://huggingface.co/datasets/Neelectric/MoT_all_Llama3_8192toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_MoTv00.03", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_mot/runs/dd731pjg) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.28.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..6208452 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 1.0910187920114791e+20, + "train_loss": 0.7627655786175638, + "train_runtime": 75075.1021, + "train_samples": 269513, + "train_samples_per_second": 10.77, + "train_steps_per_second": 0.067 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ 
b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think> +... +</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and 
tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git 
a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..53310ee --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49754827cb27ea32ff2d308986ccbd1b06cd5c8713a9bdad1c1965e08092bba1 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..6e2510f --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d610a39b0de253658d9a942db575b2d78a0524ec4e2ad2e54501ccf84c153679 +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..e373ff2 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c374a10b99a8bc26d6d5d3cd762b07d7dfbbbd67277ca6ccf386e35fe8f09838 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..026a31c --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8858dd1a2f52048135e5a331a08379e281f0b441cbd05f06e3e34b239ebdf6cd +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + 
"model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": 
"<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": 
"<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": 
"<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..6208452 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 
1.0910187920114791e+20, + "train_loss": 0.7627655786175638, + "train_runtime": 75075.1021, + "train_samples": 269513, + "train_samples_per_second": 10.77, + "train_steps_per_second": 0.067 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..a37f54a --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,45538 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 5055, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005934718100890207, + "grad_norm": 4.329995632171631, + "learning_rate": 0.0, + "loss": 1.2365, + "mean_token_accuracy": 0.6699963808059692, + "num_tokens": 472893.0, + "step": 1 + }, + { + "epoch": 0.0011869436201780415, + "grad_norm": 4.279911518096924, + "learning_rate": 1.9762845849802368e-09, + "loss": 1.1682, + "mean_token_accuracy": 0.6863796710968018, + "num_tokens": 925449.0, + "step": 2 + }, + { + "epoch": 0.0017804154302670622, + "grad_norm": 4.108682632446289, + "learning_rate": 3.9525691699604735e-09, + "loss": 1.1637, + "mean_token_accuracy": 0.6860256195068359, + "num_tokens": 1409116.0, + "step": 3 + }, + { + "epoch": 0.002373887240356083, + "grad_norm": 4.31687593460083, + "learning_rate": 5.928853754940711e-09, + "loss": 1.1809, + "mean_token_accuracy": 0.6821081638336182, + "num_tokens": 1870038.0, + "step": 4 + }, + { + "epoch": 0.002967359050445104, + "grad_norm": 4.2299089431762695, + "learning_rate": 7.905138339920947e-09, + "loss": 1.1806, + "mean_token_accuracy": 0.6818673014640808, + "num_tokens": 2341637.0, + "step": 5 + }, + { + "epoch": 0.0035608308605341245, + "grad_norm": 4.314027786254883, + "learning_rate": 9.881422924901186e-09, + "loss": 1.271, + "mean_token_accuracy": 0.6604929566383362, + "num_tokens": 2828727.0, + "step": 6 + }, + { + "epoch": 0.004154302670623145, + "grad_norm": 
4.167956352233887, + "learning_rate": 1.1857707509881422e-08, + "loss": 1.1351, + "mean_token_accuracy": 0.6923144459724426, + "num_tokens": 3305127.0, + "step": 7 + }, + { + "epoch": 0.004747774480712166, + "grad_norm": 4.388026714324951, + "learning_rate": 1.3833992094861659e-08, + "loss": 1.2583, + "mean_token_accuracy": 0.6649380922317505, + "num_tokens": 3782067.0, + "step": 8 + }, + { + "epoch": 0.005341246290801187, + "grad_norm": 4.043204307556152, + "learning_rate": 1.5810276679841894e-08, + "loss": 1.1524, + "mean_token_accuracy": 0.6871377229690552, + "num_tokens": 4273252.0, + "step": 9 + }, + { + "epoch": 0.005934718100890208, + "grad_norm": 4.148902893066406, + "learning_rate": 1.7786561264822136e-08, + "loss": 1.1923, + "mean_token_accuracy": 0.6808280348777771, + "num_tokens": 4755619.0, + "step": 10 + }, + { + "epoch": 0.006528189910979229, + "grad_norm": 4.160733699798584, + "learning_rate": 1.976284584980237e-08, + "loss": 1.1782, + "mean_token_accuracy": 0.6840677857398987, + "num_tokens": 5268865.0, + "step": 11 + }, + { + "epoch": 0.007121661721068249, + "grad_norm": 4.170618534088135, + "learning_rate": 2.1739130434782606e-08, + "loss": 1.1447, + "mean_token_accuracy": 0.6917706727981567, + "num_tokens": 5763367.0, + "step": 12 + }, + { + "epoch": 0.00771513353115727, + "grad_norm": 4.129462718963623, + "learning_rate": 2.3715415019762845e-08, + "loss": 1.2267, + "mean_token_accuracy": 0.6721545457839966, + "num_tokens": 6289713.0, + "step": 13 + }, + { + "epoch": 0.00830860534124629, + "grad_norm": 4.166499137878418, + "learning_rate": 2.5691699604743083e-08, + "loss": 1.1801, + "mean_token_accuracy": 0.6803292036056519, + "num_tokens": 6770213.0, + "step": 14 + }, + { + "epoch": 0.008902077151335312, + "grad_norm": 4.09372091293335, + "learning_rate": 2.7667984189723318e-08, + "loss": 1.1698, + "mean_token_accuracy": 0.683782696723938, + "num_tokens": 7282289.0, + "step": 15 + }, + { + "epoch": 0.009495548961424332, + "grad_norm": 
3.9853365421295166, + "learning_rate": 2.9644268774703553e-08, + "loss": 1.0992, + "mean_token_accuracy": 0.7026931643486023, + "num_tokens": 7784786.0, + "step": 16 + }, + { + "epoch": 0.010089020771513354, + "grad_norm": 4.242081165313721, + "learning_rate": 3.162055335968379e-08, + "loss": 1.2255, + "mean_token_accuracy": 0.6729483604431152, + "num_tokens": 8269035.0, + "step": 17 + }, + { + "epoch": 0.010682492581602374, + "grad_norm": 4.341405868530273, + "learning_rate": 3.3596837944664033e-08, + "loss": 1.2082, + "mean_token_accuracy": 0.6760822534561157, + "num_tokens": 8719468.0, + "step": 18 + }, + { + "epoch": 0.011275964391691394, + "grad_norm": 4.24714994430542, + "learning_rate": 3.557312252964427e-08, + "loss": 1.2117, + "mean_token_accuracy": 0.6739240884780884, + "num_tokens": 9201255.0, + "step": 19 + }, + { + "epoch": 0.011869436201780416, + "grad_norm": 4.005700588226318, + "learning_rate": 3.7549407114624504e-08, + "loss": 1.102, + "mean_token_accuracy": 0.7006856203079224, + "num_tokens": 9694142.0, + "step": 20 + }, + { + "epoch": 0.012462908011869436, + "grad_norm": 4.380154609680176, + "learning_rate": 3.952569169960474e-08, + "loss": 1.2092, + "mean_token_accuracy": 0.6761780977249146, + "num_tokens": 10130364.0, + "step": 21 + }, + { + "epoch": 0.013056379821958458, + "grad_norm": 4.262560844421387, + "learning_rate": 4.150197628458498e-08, + "loss": 1.1768, + "mean_token_accuracy": 0.684180736541748, + "num_tokens": 10591839.0, + "step": 22 + }, + { + "epoch": 0.013649851632047478, + "grad_norm": 4.217881679534912, + "learning_rate": 4.347826086956521e-08, + "loss": 1.1816, + "mean_token_accuracy": 0.6814261078834534, + "num_tokens": 11051855.0, + "step": 23 + }, + { + "epoch": 0.014243323442136498, + "grad_norm": 4.25640869140625, + "learning_rate": 4.545454545454545e-08, + "loss": 1.2223, + "mean_token_accuracy": 0.6730862259864807, + "num_tokens": 11530509.0, + "step": 24 + }, + { + "epoch": 0.01483679525222552, + "grad_norm": 
4.27303409576416, + "learning_rate": 4.743083003952569e-08, + "loss": 1.1907, + "mean_token_accuracy": 0.6813778281211853, + "num_tokens": 11982833.0, + "step": 25 + }, + { + "epoch": 0.01543026706231454, + "grad_norm": 4.234869480133057, + "learning_rate": 4.940711462450593e-08, + "loss": 1.2363, + "mean_token_accuracy": 0.6681424379348755, + "num_tokens": 12462893.0, + "step": 26 + }, + { + "epoch": 0.016023738872403562, + "grad_norm": 4.263824462890625, + "learning_rate": 5.1383399209486166e-08, + "loss": 1.13, + "mean_token_accuracy": 0.6942803859710693, + "num_tokens": 12912490.0, + "step": 27 + }, + { + "epoch": 0.01661721068249258, + "grad_norm": 4.289609432220459, + "learning_rate": 5.33596837944664e-08, + "loss": 1.2127, + "mean_token_accuracy": 0.676795482635498, + "num_tokens": 13369659.0, + "step": 28 + }, + { + "epoch": 0.017210682492581602, + "grad_norm": 4.4041852951049805, + "learning_rate": 5.5335968379446636e-08, + "loss": 1.2073, + "mean_token_accuracy": 0.6765971183776855, + "num_tokens": 13810677.0, + "step": 29 + }, + { + "epoch": 0.017804154302670624, + "grad_norm": 4.22867488861084, + "learning_rate": 5.7312252964426875e-08, + "loss": 1.1595, + "mean_token_accuracy": 0.688884437084198, + "num_tokens": 14248070.0, + "step": 30 + }, + { + "epoch": 0.018397626112759646, + "grad_norm": 4.145590305328369, + "learning_rate": 5.9288537549407106e-08, + "loss": 1.1613, + "mean_token_accuracy": 0.6871496438980103, + "num_tokens": 14727923.0, + "step": 31 + }, + { + "epoch": 0.018991097922848664, + "grad_norm": 4.20858097076416, + "learning_rate": 6.126482213438735e-08, + "loss": 1.2114, + "mean_token_accuracy": 0.6760462522506714, + "num_tokens": 15204398.0, + "step": 32 + }, + { + "epoch": 0.019584569732937686, + "grad_norm": 4.168067455291748, + "learning_rate": 6.324110671936758e-08, + "loss": 1.181, + "mean_token_accuracy": 0.6820054650306702, + "num_tokens": 15671163.0, + "step": 33 + }, + { + "epoch": 0.020178041543026708, + "grad_norm": 
4.010863780975342, + "learning_rate": 6.521739130434782e-08, + "loss": 1.1462, + "mean_token_accuracy": 0.6902179718017578, + "num_tokens": 16168926.0, + "step": 34 + }, + { + "epoch": 0.020771513353115726, + "grad_norm": 3.807529926300049, + "learning_rate": 6.719367588932807e-08, + "loss": 1.1123, + "mean_token_accuracy": 0.6985733509063721, + "num_tokens": 16684427.0, + "step": 35 + }, + { + "epoch": 0.021364985163204748, + "grad_norm": 3.8950304985046387, + "learning_rate": 6.91699604743083e-08, + "loss": 1.1261, + "mean_token_accuracy": 0.6956652402877808, + "num_tokens": 17196245.0, + "step": 36 + }, + { + "epoch": 0.02195845697329377, + "grad_norm": 4.282976150512695, + "learning_rate": 7.114624505928854e-08, + "loss": 1.2496, + "mean_token_accuracy": 0.6639750003814697, + "num_tokens": 17653461.0, + "step": 37 + }, + { + "epoch": 0.022551928783382788, + "grad_norm": 3.99062442779541, + "learning_rate": 7.312252964426877e-08, + "loss": 1.1716, + "mean_token_accuracy": 0.6840022802352905, + "num_tokens": 18145232.0, + "step": 38 + }, + { + "epoch": 0.02314540059347181, + "grad_norm": 4.103357791900635, + "learning_rate": 7.509881422924901e-08, + "loss": 1.2377, + "mean_token_accuracy": 0.6681040525436401, + "num_tokens": 18615536.0, + "step": 39 + }, + { + "epoch": 0.02373887240356083, + "grad_norm": 3.888371706008911, + "learning_rate": 7.707509881422925e-08, + "loss": 1.1926, + "mean_token_accuracy": 0.6795944571495056, + "num_tokens": 19130807.0, + "step": 40 + }, + { + "epoch": 0.02433234421364985, + "grad_norm": 4.255561351776123, + "learning_rate": 7.905138339920948e-08, + "loss": 1.2054, + "mean_token_accuracy": 0.6768653392791748, + "num_tokens": 19562222.0, + "step": 41 + }, + { + "epoch": 0.024925816023738872, + "grad_norm": 4.057368278503418, + "learning_rate": 8.102766798418972e-08, + "loss": 1.1569, + "mean_token_accuracy": 0.6882278919219971, + "num_tokens": 20033124.0, + "step": 42 + }, + { + "epoch": 0.025519287833827894, + "grad_norm": 
4.126697063446045, + "learning_rate": 8.300395256916996e-08, + "loss": 1.1528, + "mean_token_accuracy": 0.6856766939163208, + "num_tokens": 20484155.0, + "step": 43 + }, + { + "epoch": 0.026112759643916916, + "grad_norm": 4.091825008392334, + "learning_rate": 8.498023715415019e-08, + "loss": 1.206, + "mean_token_accuracy": 0.6758219003677368, + "num_tokens": 20951421.0, + "step": 44 + }, + { + "epoch": 0.026706231454005934, + "grad_norm": 4.018275737762451, + "learning_rate": 8.695652173913042e-08, + "loss": 1.1426, + "mean_token_accuracy": 0.6891664266586304, + "num_tokens": 21425188.0, + "step": 45 + }, + { + "epoch": 0.027299703264094956, + "grad_norm": 3.7415428161621094, + "learning_rate": 8.893280632411066e-08, + "loss": 1.1112, + "mean_token_accuracy": 0.6948672533035278, + "num_tokens": 21920355.0, + "step": 46 + }, + { + "epoch": 0.027893175074183978, + "grad_norm": 3.7833921909332275, + "learning_rate": 9.09090909090909e-08, + "loss": 1.2233, + "mean_token_accuracy": 0.6694130897521973, + "num_tokens": 22402891.0, + "step": 47 + }, + { + "epoch": 0.028486646884272996, + "grad_norm": 3.6543140411376953, + "learning_rate": 9.288537549407115e-08, + "loss": 1.1598, + "mean_token_accuracy": 0.6836632490158081, + "num_tokens": 22918264.0, + "step": 48 + }, + { + "epoch": 0.029080118694362018, + "grad_norm": 3.796630382537842, + "learning_rate": 9.486166007905138e-08, + "loss": 1.1532, + "mean_token_accuracy": 0.6856861114501953, + "num_tokens": 23382220.0, + "step": 49 + }, + { + "epoch": 0.02967359050445104, + "grad_norm": 3.628998279571533, + "learning_rate": 9.683794466403162e-08, + "loss": 1.1703, + "mean_token_accuracy": 0.68364417552948, + "num_tokens": 23880574.0, + "step": 50 + }, + { + "epoch": 0.030267062314540058, + "grad_norm": 3.635916233062744, + "learning_rate": 9.881422924901186e-08, + "loss": 1.1518, + "mean_token_accuracy": 0.687823474407196, + "num_tokens": 24381018.0, + "step": 51 + }, + { + "epoch": 0.03086053412462908, + "grad_norm": 
3.708353281021118, + "learning_rate": 1.007905138339921e-07, + "loss": 1.2111, + "mean_token_accuracy": 0.6727175116539001, + "num_tokens": 24874044.0, + "step": 52 + }, + { + "epoch": 0.0314540059347181, + "grad_norm": 3.680727958679199, + "learning_rate": 1.0276679841897233e-07, + "loss": 1.1963, + "mean_token_accuracy": 0.6756713390350342, + "num_tokens": 25373321.0, + "step": 53 + }, + { + "epoch": 0.032047477744807124, + "grad_norm": 3.6335203647613525, + "learning_rate": 1.0474308300395257e-07, + "loss": 1.1548, + "mean_token_accuracy": 0.6882919073104858, + "num_tokens": 25872911.0, + "step": 54 + }, + { + "epoch": 0.032640949554896145, + "grad_norm": 3.754744529724121, + "learning_rate": 1.067193675889328e-07, + "loss": 1.1373, + "mean_token_accuracy": 0.6903918385505676, + "num_tokens": 26336112.0, + "step": 55 + }, + { + "epoch": 0.03323442136498516, + "grad_norm": 3.687511444091797, + "learning_rate": 1.0869565217391303e-07, + "loss": 1.1197, + "mean_token_accuracy": 0.6952705383300781, + "num_tokens": 26799450.0, + "step": 56 + }, + { + "epoch": 0.03382789317507418, + "grad_norm": 3.6879074573516846, + "learning_rate": 1.1067193675889327e-07, + "loss": 1.1688, + "mean_token_accuracy": 0.6835487484931946, + "num_tokens": 27278023.0, + "step": 57 + }, + { + "epoch": 0.034421364985163204, + "grad_norm": 3.686906337738037, + "learning_rate": 1.1264822134387351e-07, + "loss": 1.1866, + "mean_token_accuracy": 0.6779959201812744, + "num_tokens": 27780542.0, + "step": 58 + }, + { + "epoch": 0.035014836795252226, + "grad_norm": 3.7053730487823486, + "learning_rate": 1.1462450592885375e-07, + "loss": 1.1803, + "mean_token_accuracy": 0.676304042339325, + "num_tokens": 28267537.0, + "step": 59 + }, + { + "epoch": 0.03560830860534125, + "grad_norm": 3.6996984481811523, + "learning_rate": 1.1660079051383399e-07, + "loss": 1.1877, + "mean_token_accuracy": 0.679092288017273, + "num_tokens": 28755186.0, + "step": 60 + }, + { + "epoch": 0.03620178041543027, + 
"grad_norm": 3.8012301921844482, + "learning_rate": 1.1857707509881421e-07, + "loss": 1.135, + "mean_token_accuracy": 0.6902598142623901, + "num_tokens": 29223570.0, + "step": 61 + }, + { + "epoch": 0.03679525222551929, + "grad_norm": 3.869266986846924, + "learning_rate": 1.2055335968379446e-07, + "loss": 1.1416, + "mean_token_accuracy": 0.6875501871109009, + "num_tokens": 29709207.0, + "step": 62 + }, + { + "epoch": 0.037388724035608306, + "grad_norm": 3.638561964035034, + "learning_rate": 1.225296442687747e-07, + "loss": 1.118, + "mean_token_accuracy": 0.6924474239349365, + "num_tokens": 30234738.0, + "step": 63 + }, + { + "epoch": 0.03798219584569733, + "grad_norm": 3.4501705169677734, + "learning_rate": 1.2450592885375494e-07, + "loss": 1.1207, + "mean_token_accuracy": 0.6902588605880737, + "num_tokens": 30708333.0, + "step": 64 + }, + { + "epoch": 0.03857566765578635, + "grad_norm": 3.1403965950012207, + "learning_rate": 1.2648221343873515e-07, + "loss": 1.1267, + "mean_token_accuracy": 0.6896986961364746, + "num_tokens": 31184252.0, + "step": 65 + }, + { + "epoch": 0.03916913946587537, + "grad_norm": 3.2493629455566406, + "learning_rate": 1.2845849802371542e-07, + "loss": 1.122, + "mean_token_accuracy": 0.6912283897399902, + "num_tokens": 31672966.0, + "step": 66 + }, + { + "epoch": 0.039762611275964393, + "grad_norm": 3.352238893508911, + "learning_rate": 1.3043478260869563e-07, + "loss": 1.1324, + "mean_token_accuracy": 0.6888470649719238, + "num_tokens": 32140039.0, + "step": 67 + }, + { + "epoch": 0.040356083086053415, + "grad_norm": 3.2559096813201904, + "learning_rate": 1.324110671936759e-07, + "loss": 1.1608, + "mean_token_accuracy": 0.6835185885429382, + "num_tokens": 32613695.0, + "step": 68 + }, + { + "epoch": 0.04094955489614243, + "grad_norm": 3.0833325386047363, + "learning_rate": 1.3438735177865613e-07, + "loss": 1.0994, + "mean_token_accuracy": 0.696325421333313, + "num_tokens": 33105817.0, + "step": 69 + }, + { + "epoch": 0.04154302670623145, 
+ "grad_norm": 2.8741934299468994, + "learning_rate": 1.3636363636363635e-07, + "loss": 1.1475, + "mean_token_accuracy": 0.6831724047660828, + "num_tokens": 33598803.0, + "step": 70 + }, + { + "epoch": 0.042136498516320474, + "grad_norm": 2.770341396331787, + "learning_rate": 1.383399209486166e-07, + "loss": 1.1129, + "mean_token_accuracy": 0.6926307082176208, + "num_tokens": 34088321.0, + "step": 71 + }, + { + "epoch": 0.042729970326409496, + "grad_norm": 2.950549364089966, + "learning_rate": 1.4031620553359682e-07, + "loss": 1.1678, + "mean_token_accuracy": 0.6790376901626587, + "num_tokens": 34547859.0, + "step": 72 + }, + { + "epoch": 0.04332344213649852, + "grad_norm": 2.7084901332855225, + "learning_rate": 1.422924901185771e-07, + "loss": 1.1533, + "mean_token_accuracy": 0.6823703050613403, + "num_tokens": 35067219.0, + "step": 73 + }, + { + "epoch": 0.04391691394658754, + "grad_norm": 2.9293346405029297, + "learning_rate": 1.442687747035573e-07, + "loss": 1.1214, + "mean_token_accuracy": 0.6884233951568604, + "num_tokens": 35527669.0, + "step": 74 + }, + { + "epoch": 0.04451038575667656, + "grad_norm": 2.805152654647827, + "learning_rate": 1.4624505928853754e-07, + "loss": 1.1093, + "mean_token_accuracy": 0.6920377016067505, + "num_tokens": 36005349.0, + "step": 75 + }, + { + "epoch": 0.045103857566765576, + "grad_norm": 2.799527168273926, + "learning_rate": 1.4822134387351778e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.7024255990982056, + "num_tokens": 36468326.0, + "step": 76 + }, + { + "epoch": 0.0456973293768546, + "grad_norm": 2.812391996383667, + "learning_rate": 1.5019762845849801e-07, + "loss": 1.1205, + "mean_token_accuracy": 0.6898901462554932, + "num_tokens": 36931347.0, + "step": 77 + }, + { + "epoch": 0.04629080118694362, + "grad_norm": 2.7907679080963135, + "learning_rate": 1.5217391304347825e-07, + "loss": 1.1538, + "mean_token_accuracy": 0.6818432807922363, + "num_tokens": 37375053.0, + "step": 78 + }, + { + "epoch": 
0.04688427299703264, + "grad_norm": 2.639336109161377, + "learning_rate": 1.541501976284585e-07, + "loss": 1.1027, + "mean_token_accuracy": 0.6946268081665039, + "num_tokens": 37848219.0, + "step": 79 + }, + { + "epoch": 0.04747774480712166, + "grad_norm": 2.6221699714660645, + "learning_rate": 1.561264822134387e-07, + "loss": 1.0953, + "mean_token_accuracy": 0.693778395652771, + "num_tokens": 38312123.0, + "step": 80 + }, + { + "epoch": 0.048071216617210685, + "grad_norm": 2.616809606552124, + "learning_rate": 1.5810276679841897e-07, + "loss": 1.0858, + "mean_token_accuracy": 0.6969197392463684, + "num_tokens": 38807738.0, + "step": 81 + }, + { + "epoch": 0.0486646884272997, + "grad_norm": 2.5686588287353516, + "learning_rate": 1.600790513833992e-07, + "loss": 1.0549, + "mean_token_accuracy": 0.7045440673828125, + "num_tokens": 39277593.0, + "step": 82 + }, + { + "epoch": 0.04925816023738872, + "grad_norm": 2.608707904815674, + "learning_rate": 1.6205533596837944e-07, + "loss": 1.0788, + "mean_token_accuracy": 0.6978940963745117, + "num_tokens": 39759317.0, + "step": 83 + }, + { + "epoch": 0.049851632047477744, + "grad_norm": 2.510345220565796, + "learning_rate": 1.6403162055335968e-07, + "loss": 1.0672, + "mean_token_accuracy": 0.7006291747093201, + "num_tokens": 40251230.0, + "step": 84 + }, + { + "epoch": 0.050445103857566766, + "grad_norm": 2.54935884475708, + "learning_rate": 1.6600790513833992e-07, + "loss": 1.0642, + "mean_token_accuracy": 0.7044142484664917, + "num_tokens": 40695991.0, + "step": 85 + }, + { + "epoch": 0.05103857566765579, + "grad_norm": 2.41926908493042, + "learning_rate": 1.6798418972332016e-07, + "loss": 1.1702, + "mean_token_accuracy": 0.6754936575889587, + "num_tokens": 41207553.0, + "step": 86 + }, + { + "epoch": 0.05163204747774481, + "grad_norm": 2.234431266784668, + "learning_rate": 1.6996047430830037e-07, + "loss": 1.0422, + "mean_token_accuracy": 0.7096479535102844, + "num_tokens": 41722631.0, + "step": 87 + }, + { + "epoch": 
0.05222551928783383, + "grad_norm": 2.159269332885742, + "learning_rate": 1.7193675889328064e-07, + "loss": 1.0627, + "mean_token_accuracy": 0.7024141550064087, + "num_tokens": 42250121.0, + "step": 88 + }, + { + "epoch": 0.052818991097922846, + "grad_norm": 2.117943525314331, + "learning_rate": 1.7391304347826085e-07, + "loss": 1.1335, + "mean_token_accuracy": 0.6838207244873047, + "num_tokens": 42744785.0, + "step": 89 + }, + { + "epoch": 0.05341246290801187, + "grad_norm": 2.0760395526885986, + "learning_rate": 1.7588932806324111e-07, + "loss": 1.0921, + "mean_token_accuracy": 0.6960253715515137, + "num_tokens": 43191180.0, + "step": 90 + }, + { + "epoch": 0.05400593471810089, + "grad_norm": 1.8761813640594482, + "learning_rate": 1.7786561264822133e-07, + "loss": 1.086, + "mean_token_accuracy": 0.6962816715240479, + "num_tokens": 43725350.0, + "step": 91 + }, + { + "epoch": 0.05459940652818991, + "grad_norm": 1.919440746307373, + "learning_rate": 1.7984189723320156e-07, + "loss": 1.0778, + "mean_token_accuracy": 0.6972314119338989, + "num_tokens": 44225153.0, + "step": 92 + }, + { + "epoch": 0.05519287833827893, + "grad_norm": 1.8495317697525024, + "learning_rate": 1.818181818181818e-07, + "loss": 1.0123, + "mean_token_accuracy": 0.7131823301315308, + "num_tokens": 44698246.0, + "step": 93 + }, + { + "epoch": 0.055786350148367955, + "grad_norm": 1.8333457708358765, + "learning_rate": 1.8379446640316204e-07, + "loss": 1.0771, + "mean_token_accuracy": 0.6976057291030884, + "num_tokens": 45151792.0, + "step": 94 + }, + { + "epoch": 0.05637982195845697, + "grad_norm": 1.9086087942123413, + "learning_rate": 1.857707509881423e-07, + "loss": 1.0537, + "mean_token_accuracy": 0.7037248611450195, + "num_tokens": 45597408.0, + "step": 95 + }, + { + "epoch": 0.05697329376854599, + "grad_norm": 1.775078535079956, + "learning_rate": 1.8774703557312252e-07, + "loss": 1.0576, + "mean_token_accuracy": 0.7018061280250549, + "num_tokens": 46060339.0, + "step": 96 + }, + { + 
"epoch": 0.057566765578635014, + "grad_norm": 1.6162614822387695, + "learning_rate": 1.8972332015810276e-07, + "loss": 1.0375, + "mean_token_accuracy": 0.7056398391723633, + "num_tokens": 46576249.0, + "step": 97 + }, + { + "epoch": 0.058160237388724036, + "grad_norm": 2.0089333057403564, + "learning_rate": 1.91699604743083e-07, + "loss": 1.0712, + "mean_token_accuracy": 0.6985707879066467, + "num_tokens": 47021545.0, + "step": 98 + }, + { + "epoch": 0.05875370919881306, + "grad_norm": 1.6505218744277954, + "learning_rate": 1.9367588932806323e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.7075949907302856, + "num_tokens": 47466265.0, + "step": 99 + }, + { + "epoch": 0.05934718100890208, + "grad_norm": 1.6719398498535156, + "learning_rate": 1.9565217391304347e-07, + "loss": 1.065, + "mean_token_accuracy": 0.700183629989624, + "num_tokens": 47930568.0, + "step": 100 + }, + { + "epoch": 0.0599406528189911, + "grad_norm": 1.7203468084335327, + "learning_rate": 1.976284584980237e-07, + "loss": 1.0822, + "mean_token_accuracy": 0.6927483677864075, + "num_tokens": 48377239.0, + "step": 101 + }, + { + "epoch": 0.060534124629080116, + "grad_norm": 1.6177046298980713, + "learning_rate": 1.9960474308300395e-07, + "loss": 1.0496, + "mean_token_accuracy": 0.7010847330093384, + "num_tokens": 48873101.0, + "step": 102 + }, + { + "epoch": 0.06112759643916914, + "grad_norm": 1.7351739406585693, + "learning_rate": 2.015810276679842e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.7011352181434631, + "num_tokens": 49355297.0, + "step": 103 + }, + { + "epoch": 0.06172106824925816, + "grad_norm": 1.6905772686004639, + "learning_rate": 2.035573122529644e-07, + "loss": 1.0268, + "mean_token_accuracy": 0.7069001197814941, + "num_tokens": 49800945.0, + "step": 104 + }, + { + "epoch": 0.06231454005934718, + "grad_norm": 1.686896562576294, + "learning_rate": 2.0553359683794466e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.7024024724960327, + "num_tokens": 50260390.0, + "step": 105 + 
}, + { + "epoch": 0.0629080118694362, + "grad_norm": 1.7790712118148804, + "learning_rate": 2.0750988142292488e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.7130228877067566, + "num_tokens": 50756701.0, + "step": 106 + }, + { + "epoch": 0.06350148367952523, + "grad_norm": 1.7002285718917847, + "learning_rate": 2.0948616600790514e-07, + "loss": 0.9911, + "mean_token_accuracy": 0.7170019745826721, + "num_tokens": 51236249.0, + "step": 107 + }, + { + "epoch": 0.06409495548961425, + "grad_norm": 2.500842571258545, + "learning_rate": 2.1146245059288538e-07, + "loss": 0.9922, + "mean_token_accuracy": 0.7167669534683228, + "num_tokens": 51680408.0, + "step": 108 + }, + { + "epoch": 0.06468842729970327, + "grad_norm": 2.2071645259857178, + "learning_rate": 2.134387351778656e-07, + "loss": 1.0369, + "mean_token_accuracy": 0.7075533270835876, + "num_tokens": 52147870.0, + "step": 109 + }, + { + "epoch": 0.06528189910979229, + "grad_norm": 1.6183637380599976, + "learning_rate": 2.1541501976284586e-07, + "loss": 0.9911, + "mean_token_accuracy": 0.7169036865234375, + "num_tokens": 52624349.0, + "step": 110 + }, + { + "epoch": 0.06587537091988131, + "grad_norm": 1.5242271423339844, + "learning_rate": 2.1739130434782607e-07, + "loss": 0.9638, + "mean_token_accuracy": 0.7230547070503235, + "num_tokens": 53088669.0, + "step": 111 + }, + { + "epoch": 0.06646884272997032, + "grad_norm": 2.1905131340026855, + "learning_rate": 2.1936758893280633e-07, + "loss": 1.093, + "mean_token_accuracy": 0.6886640787124634, + "num_tokens": 53526056.0, + "step": 112 + }, + { + "epoch": 0.06706231454005934, + "grad_norm": 1.8787083625793457, + "learning_rate": 2.2134387351778654e-07, + "loss": 1.0278, + "mean_token_accuracy": 0.706005871295929, + "num_tokens": 54005090.0, + "step": 113 + }, + { + "epoch": 0.06765578635014836, + "grad_norm": 1.7665971517562866, + "learning_rate": 2.2332015810276678e-07, + "loss": 1.0165, + "mean_token_accuracy": 0.7090700268745422, + "num_tokens": 54491859.0, + 
"step": 114 + }, + { + "epoch": 0.06824925816023739, + "grad_norm": 1.5752226114273071, + "learning_rate": 2.2529644268774702e-07, + "loss": 0.9897, + "mean_token_accuracy": 0.7151110172271729, + "num_tokens": 54947141.0, + "step": 115 + }, + { + "epoch": 0.06884272997032641, + "grad_norm": 3.1721315383911133, + "learning_rate": 2.2727272727272726e-07, + "loss": 1.0068, + "mean_token_accuracy": 0.7114243507385254, + "num_tokens": 55434793.0, + "step": 116 + }, + { + "epoch": 0.06943620178041543, + "grad_norm": 2.016284942626953, + "learning_rate": 2.292490118577075e-07, + "loss": 1.034, + "mean_token_accuracy": 0.7052284479141235, + "num_tokens": 55904700.0, + "step": 117 + }, + { + "epoch": 0.07002967359050445, + "grad_norm": 2.148000955581665, + "learning_rate": 2.3122529644268774e-07, + "loss": 0.9838, + "mean_token_accuracy": 0.7181544899940491, + "num_tokens": 56442269.0, + "step": 118 + }, + { + "epoch": 0.07062314540059347, + "grad_norm": 2.1329991817474365, + "learning_rate": 2.3320158102766798e-07, + "loss": 1.0818, + "mean_token_accuracy": 0.6906905174255371, + "num_tokens": 56924526.0, + "step": 119 + }, + { + "epoch": 0.0712166172106825, + "grad_norm": 1.7432703971862793, + "learning_rate": 2.3517786561264821e-07, + "loss": 0.9994, + "mean_token_accuracy": 0.7126615047454834, + "num_tokens": 57378170.0, + "step": 120 + }, + { + "epoch": 0.07181008902077152, + "grad_norm": 2.21308970451355, + "learning_rate": 2.3715415019762843e-07, + "loss": 1.0179, + "mean_token_accuracy": 0.7072174549102783, + "num_tokens": 57869245.0, + "step": 121 + }, + { + "epoch": 0.07240356083086054, + "grad_norm": 1.6621547937393188, + "learning_rate": 2.391304347826087e-07, + "loss": 0.9941, + "mean_token_accuracy": 0.7126469612121582, + "num_tokens": 58344446.0, + "step": 122 + }, + { + "epoch": 0.07299703264094956, + "grad_norm": 2.124418020248413, + "learning_rate": 2.4110671936758893e-07, + "loss": 1.0222, + "mean_token_accuracy": 0.7087111473083496, + "num_tokens": 
58829179.0, + "step": 123 + }, + { + "epoch": 0.07359050445103858, + "grad_norm": 1.661022424697876, + "learning_rate": 2.430830039525692e-07, + "loss": 1.0264, + "mean_token_accuracy": 0.7055444717407227, + "num_tokens": 59319168.0, + "step": 124 + }, + { + "epoch": 0.07418397626112759, + "grad_norm": 2.3938417434692383, + "learning_rate": 2.450592885375494e-07, + "loss": 0.9781, + "mean_token_accuracy": 0.7180513143539429, + "num_tokens": 59815565.0, + "step": 125 + }, + { + "epoch": 0.07477744807121661, + "grad_norm": 1.7344286441802979, + "learning_rate": 2.470355731225296e-07, + "loss": 0.9794, + "mean_token_accuracy": 0.7172185182571411, + "num_tokens": 60284241.0, + "step": 126 + }, + { + "epoch": 0.07537091988130563, + "grad_norm": 2.1075172424316406, + "learning_rate": 2.490118577075099e-07, + "loss": 0.9672, + "mean_token_accuracy": 0.719863772392273, + "num_tokens": 60728154.0, + "step": 127 + }, + { + "epoch": 0.07596439169139466, + "grad_norm": 1.369480848312378, + "learning_rate": 2.509881422924901e-07, + "loss": 1.0622, + "mean_token_accuracy": 0.6962934732437134, + "num_tokens": 61240185.0, + "step": 128 + }, + { + "epoch": 0.07655786350148368, + "grad_norm": 1.8134455680847168, + "learning_rate": 2.529644268774703e-07, + "loss": 1.0424, + "mean_token_accuracy": 0.7001723647117615, + "num_tokens": 61717527.0, + "step": 129 + }, + { + "epoch": 0.0771513353115727, + "grad_norm": 1.3999743461608887, + "learning_rate": 2.549407114624506e-07, + "loss": 0.9822, + "mean_token_accuracy": 0.7173259258270264, + "num_tokens": 62188022.0, + "step": 130 + }, + { + "epoch": 0.07774480712166172, + "grad_norm": 1.6534112691879272, + "learning_rate": 2.5691699604743084e-07, + "loss": 1.0145, + "mean_token_accuracy": 0.7100523710250854, + "num_tokens": 62677782.0, + "step": 131 + }, + { + "epoch": 0.07833827893175074, + "grad_norm": 1.7254984378814697, + "learning_rate": 2.5889328063241105e-07, + "loss": 1.0233, + "mean_token_accuracy": 0.7078355550765991, + 
"num_tokens": 63172990.0, + "step": 132 + }, + { + "epoch": 0.07893175074183977, + "grad_norm": 2.249932289123535, + "learning_rate": 2.6086956521739126e-07, + "loss": 0.9822, + "mean_token_accuracy": 0.714245080947876, + "num_tokens": 63641657.0, + "step": 133 + }, + { + "epoch": 0.07952522255192879, + "grad_norm": 1.5236564874649048, + "learning_rate": 2.628458498023715e-07, + "loss": 1.0146, + "mean_token_accuracy": 0.7063947319984436, + "num_tokens": 64129857.0, + "step": 134 + }, + { + "epoch": 0.08011869436201781, + "grad_norm": 1.242021918296814, + "learning_rate": 2.648221343873518e-07, + "loss": 1.0094, + "mean_token_accuracy": 0.708181619644165, + "num_tokens": 64601329.0, + "step": 135 + }, + { + "epoch": 0.08071216617210683, + "grad_norm": 1.5310629606246948, + "learning_rate": 2.66798418972332e-07, + "loss": 0.9605, + "mean_token_accuracy": 0.7237765789031982, + "num_tokens": 65082898.0, + "step": 136 + }, + { + "epoch": 0.08130563798219585, + "grad_norm": 1.3264816999435425, + "learning_rate": 2.6877470355731227e-07, + "loss": 0.9752, + "mean_token_accuracy": 0.7156744003295898, + "num_tokens": 65562365.0, + "step": 137 + }, + { + "epoch": 0.08189910979228486, + "grad_norm": 1.562675952911377, + "learning_rate": 2.707509881422925e-07, + "loss": 0.9928, + "mean_token_accuracy": 0.7129755020141602, + "num_tokens": 66021443.0, + "step": 138 + }, + { + "epoch": 0.08249258160237388, + "grad_norm": 1.7056370973587036, + "learning_rate": 2.727272727272727e-07, + "loss": 0.9962, + "mean_token_accuracy": 0.7140437960624695, + "num_tokens": 66499992.0, + "step": 139 + }, + { + "epoch": 0.0830860534124629, + "grad_norm": 1.2415610551834106, + "learning_rate": 2.7470355731225296e-07, + "loss": 0.982, + "mean_token_accuracy": 0.715559184551239, + "num_tokens": 67006097.0, + "step": 140 + }, + { + "epoch": 0.08367952522255193, + "grad_norm": 1.3883453607559204, + "learning_rate": 2.766798418972332e-07, + "loss": 0.9879, + "mean_token_accuracy": 0.7110390663146973, 
+ "num_tokens": 67508514.0, + "step": 141 + }, + { + "epoch": 0.08427299703264095, + "grad_norm": 1.3901904821395874, + "learning_rate": 2.7865612648221343e-07, + "loss": 0.9646, + "mean_token_accuracy": 0.7186617851257324, + "num_tokens": 67999281.0, + "step": 142 + }, + { + "epoch": 0.08486646884272997, + "grad_norm": 1.1583325862884521, + "learning_rate": 2.8063241106719364e-07, + "loss": 0.973, + "mean_token_accuracy": 0.7155841588973999, + "num_tokens": 68481799.0, + "step": 143 + }, + { + "epoch": 0.08545994065281899, + "grad_norm": 1.3910459280014038, + "learning_rate": 2.8260869565217386e-07, + "loss": 0.9156, + "mean_token_accuracy": 0.7315620183944702, + "num_tokens": 68996101.0, + "step": 144 + }, + { + "epoch": 0.08605341246290801, + "grad_norm": 1.2492705583572388, + "learning_rate": 2.845849802371542e-07, + "loss": 0.9381, + "mean_token_accuracy": 0.724968433380127, + "num_tokens": 69516583.0, + "step": 145 + }, + { + "epoch": 0.08664688427299704, + "grad_norm": 1.5265825986862183, + "learning_rate": 2.865612648221344e-07, + "loss": 0.9027, + "mean_token_accuracy": 0.7344130873680115, + "num_tokens": 69965529.0, + "step": 146 + }, + { + "epoch": 0.08724035608308606, + "grad_norm": 1.8954845666885376, + "learning_rate": 2.885375494071146e-07, + "loss": 0.9883, + "mean_token_accuracy": 0.714535653591156, + "num_tokens": 70411732.0, + "step": 147 + }, + { + "epoch": 0.08783382789317508, + "grad_norm": 1.432931900024414, + "learning_rate": 2.905138339920948e-07, + "loss": 0.9049, + "mean_token_accuracy": 0.7328714728355408, + "num_tokens": 70909732.0, + "step": 148 + }, + { + "epoch": 0.0884272997032641, + "grad_norm": 1.2919803857803345, + "learning_rate": 2.924901185770751e-07, + "loss": 0.9093, + "mean_token_accuracy": 0.732177734375, + "num_tokens": 71391632.0, + "step": 149 + }, + { + "epoch": 0.08902077151335312, + "grad_norm": 1.5154742002487183, + "learning_rate": 2.9446640316205534e-07, + "loss": 0.9801, + "mean_token_accuracy": 
0.7140289545059204, + "num_tokens": 71840316.0, + "step": 150 + }, + { + "epoch": 0.08961424332344213, + "grad_norm": 1.3189611434936523, + "learning_rate": 2.9644268774703555e-07, + "loss": 0.9499, + "mean_token_accuracy": 0.7218482494354248, + "num_tokens": 72315373.0, + "step": 151 + }, + { + "epoch": 0.09020771513353115, + "grad_norm": 1.102960228919983, + "learning_rate": 2.984189723320158e-07, + "loss": 1.0138, + "mean_token_accuracy": 0.708362877368927, + "num_tokens": 72784305.0, + "step": 152 + }, + { + "epoch": 0.09080118694362017, + "grad_norm": 1.5032082796096802, + "learning_rate": 3.0039525691699603e-07, + "loss": 0.9758, + "mean_token_accuracy": 0.714392900466919, + "num_tokens": 73250045.0, + "step": 153 + }, + { + "epoch": 0.0913946587537092, + "grad_norm": 1.4912251234054565, + "learning_rate": 3.0237154150197624e-07, + "loss": 0.9777, + "mean_token_accuracy": 0.7165846824645996, + "num_tokens": 73679541.0, + "step": 154 + }, + { + "epoch": 0.09198813056379822, + "grad_norm": 1.0708222389221191, + "learning_rate": 3.043478260869565e-07, + "loss": 0.9358, + "mean_token_accuracy": 0.7259138822555542, + "num_tokens": 74178258.0, + "step": 155 + }, + { + "epoch": 0.09258160237388724, + "grad_norm": 1.2592315673828125, + "learning_rate": 3.0632411067193677e-07, + "loss": 0.9283, + "mean_token_accuracy": 0.7252132892608643, + "num_tokens": 74679316.0, + "step": 156 + }, + { + "epoch": 0.09317507418397626, + "grad_norm": 1.2956829071044922, + "learning_rate": 3.08300395256917e-07, + "loss": 1.0114, + "mean_token_accuracy": 0.7072532176971436, + "num_tokens": 75146643.0, + "step": 157 + }, + { + "epoch": 0.09376854599406528, + "grad_norm": 1.1924148797988892, + "learning_rate": 3.102766798418972e-07, + "loss": 0.9877, + "mean_token_accuracy": 0.7119830846786499, + "num_tokens": 75623078.0, + "step": 158 + }, + { + "epoch": 0.0943620178041543, + "grad_norm": 1.7402631044387817, + "learning_rate": 3.122529644268774e-07, + "loss": 0.9384, + 
"mean_token_accuracy": 0.7251740097999573, + "num_tokens": 76053531.0, + "step": 159 + }, + { + "epoch": 0.09495548961424333, + "grad_norm": 1.0908626317977905, + "learning_rate": 3.142292490118577e-07, + "loss": 0.9357, + "mean_token_accuracy": 0.725372314453125, + "num_tokens": 76524650.0, + "step": 160 + }, + { + "epoch": 0.09554896142433235, + "grad_norm": 1.1148658990859985, + "learning_rate": 3.1620553359683794e-07, + "loss": 0.958, + "mean_token_accuracy": 0.7191357016563416, + "num_tokens": 77045032.0, + "step": 161 + }, + { + "epoch": 0.09614243323442137, + "grad_norm": 0.9780623316764832, + "learning_rate": 3.1818181818181815e-07, + "loss": 0.8586, + "mean_token_accuracy": 0.7450464367866516, + "num_tokens": 77545763.0, + "step": 162 + }, + { + "epoch": 0.09673590504451039, + "grad_norm": 1.5936416387557983, + "learning_rate": 3.201581027667984e-07, + "loss": 0.9384, + "mean_token_accuracy": 0.725763201713562, + "num_tokens": 78029432.0, + "step": 163 + }, + { + "epoch": 0.0973293768545994, + "grad_norm": 0.9589027762413025, + "learning_rate": 3.221343873517787e-07, + "loss": 0.9586, + "mean_token_accuracy": 0.7187749743461609, + "num_tokens": 78534858.0, + "step": 164 + }, + { + "epoch": 0.09792284866468842, + "grad_norm": 1.1593360900878906, + "learning_rate": 3.241106719367589e-07, + "loss": 0.9534, + "mean_token_accuracy": 0.7218755483627319, + "num_tokens": 79040176.0, + "step": 165 + }, + { + "epoch": 0.09851632047477744, + "grad_norm": 0.8888615369796753, + "learning_rate": 3.260869565217391e-07, + "loss": 0.929, + "mean_token_accuracy": 0.7254468202590942, + "num_tokens": 79527357.0, + "step": 166 + }, + { + "epoch": 0.09910979228486647, + "grad_norm": 0.993690013885498, + "learning_rate": 3.2806324110671937e-07, + "loss": 0.922, + "mean_token_accuracy": 0.7301520109176636, + "num_tokens": 79990046.0, + "step": 167 + }, + { + "epoch": 0.09970326409495549, + "grad_norm": 1.000402569770813, + "learning_rate": 3.300395256916996e-07, + "loss": 0.8989, 
+ "mean_token_accuracy": 0.7333747148513794, + "num_tokens": 80425907.0, + "step": 168 + }, + { + "epoch": 0.10029673590504451, + "grad_norm": 1.0118838548660278, + "learning_rate": 3.3201581027667984e-07, + "loss": 0.9687, + "mean_token_accuracy": 0.7174763083457947, + "num_tokens": 80908210.0, + "step": 169 + }, + { + "epoch": 0.10089020771513353, + "grad_norm": 0.9258395433425903, + "learning_rate": 3.3399209486166006e-07, + "loss": 0.8517, + "mean_token_accuracy": 0.745137095451355, + "num_tokens": 81393864.0, + "step": 170 + }, + { + "epoch": 0.10148367952522255, + "grad_norm": 0.9628059267997742, + "learning_rate": 3.359683794466403e-07, + "loss": 0.9092, + "mean_token_accuracy": 0.7321975231170654, + "num_tokens": 81886535.0, + "step": 171 + }, + { + "epoch": 0.10207715133531158, + "grad_norm": 0.8438661694526672, + "learning_rate": 3.3794466403162053e-07, + "loss": 0.9523, + "mean_token_accuracy": 0.7204119563102722, + "num_tokens": 82364340.0, + "step": 172 + }, + { + "epoch": 0.1026706231454006, + "grad_norm": 1.373868465423584, + "learning_rate": 3.3992094861660074e-07, + "loss": 0.9223, + "mean_token_accuracy": 0.7285282015800476, + "num_tokens": 82862129.0, + "step": 173 + }, + { + "epoch": 0.10326409495548962, + "grad_norm": 1.3342310190200806, + "learning_rate": 3.4189723320158106e-07, + "loss": 0.9626, + "mean_token_accuracy": 0.7181146144866943, + "num_tokens": 83333641.0, + "step": 174 + }, + { + "epoch": 0.10385756676557864, + "grad_norm": 1.1619677543640137, + "learning_rate": 3.438735177865613e-07, + "loss": 0.9635, + "mean_token_accuracy": 0.7193343639373779, + "num_tokens": 83793263.0, + "step": 175 + }, + { + "epoch": 0.10445103857566766, + "grad_norm": 0.9947423934936523, + "learning_rate": 3.458498023715415e-07, + "loss": 0.9406, + "mean_token_accuracy": 0.7227318286895752, + "num_tokens": 84304833.0, + "step": 176 + }, + { + "epoch": 0.10504451038575667, + "grad_norm": 1.1411129236221313, + "learning_rate": 3.478260869565217e-07, + 
"loss": 0.9539, + "mean_token_accuracy": 0.7203878164291382, + "num_tokens": 84776947.0, + "step": 177 + }, + { + "epoch": 0.10563798219584569, + "grad_norm": 1.0017553567886353, + "learning_rate": 3.4980237154150196e-07, + "loss": 0.9833, + "mean_token_accuracy": 0.7097288966178894, + "num_tokens": 85231685.0, + "step": 178 + }, + { + "epoch": 0.10623145400593471, + "grad_norm": 0.8843250274658203, + "learning_rate": 3.5177865612648223e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.7057086229324341, + "num_tokens": 85708222.0, + "step": 179 + }, + { + "epoch": 0.10682492581602374, + "grad_norm": 1.128303050994873, + "learning_rate": 3.5375494071146244e-07, + "loss": 0.9471, + "mean_token_accuracy": 0.7210484147071838, + "num_tokens": 86174652.0, + "step": 180 + }, + { + "epoch": 0.10741839762611276, + "grad_norm": 1.010850191116333, + "learning_rate": 3.5573122529644265e-07, + "loss": 0.9453, + "mean_token_accuracy": 0.7196710705757141, + "num_tokens": 86649699.0, + "step": 181 + }, + { + "epoch": 0.10801186943620178, + "grad_norm": 0.8672481775283813, + "learning_rate": 3.577075098814229e-07, + "loss": 0.9519, + "mean_token_accuracy": 0.719495415687561, + "num_tokens": 87158264.0, + "step": 182 + }, + { + "epoch": 0.1086053412462908, + "grad_norm": 0.9689549207687378, + "learning_rate": 3.5968379446640313e-07, + "loss": 0.9649, + "mean_token_accuracy": 0.7153410911560059, + "num_tokens": 87582869.0, + "step": 183 + }, + { + "epoch": 0.10919881305637982, + "grad_norm": 0.8218018412590027, + "learning_rate": 3.616600790513834e-07, + "loss": 0.9228, + "mean_token_accuracy": 0.7258034348487854, + "num_tokens": 88065438.0, + "step": 184 + }, + { + "epoch": 0.10979228486646884, + "grad_norm": 0.9788339138031006, + "learning_rate": 3.636363636363636e-07, + "loss": 0.9, + "mean_token_accuracy": 0.73409503698349, + "num_tokens": 88556784.0, + "step": 185 + }, + { + "epoch": 0.11038575667655787, + "grad_norm": 0.763790488243103, + "learning_rate": 
3.6561264822134387e-07, + "loss": 0.9364, + "mean_token_accuracy": 0.7236003875732422, + "num_tokens": 89074406.0, + "step": 186 + }, + { + "epoch": 0.11097922848664689, + "grad_norm": 1.098475694656372, + "learning_rate": 3.675889328063241e-07, + "loss": 0.9038, + "mean_token_accuracy": 0.7292429208755493, + "num_tokens": 89525478.0, + "step": 187 + }, + { + "epoch": 0.11157270029673591, + "grad_norm": 0.8395766019821167, + "learning_rate": 3.695652173913043e-07, + "loss": 0.9214, + "mean_token_accuracy": 0.7291172742843628, + "num_tokens": 90048148.0, + "step": 188 + }, + { + "epoch": 0.11216617210682493, + "grad_norm": 0.9488620162010193, + "learning_rate": 3.715415019762846e-07, + "loss": 0.913, + "mean_token_accuracy": 0.7287275791168213, + "num_tokens": 90529172.0, + "step": 189 + }, + { + "epoch": 0.11275964391691394, + "grad_norm": 0.9313468337059021, + "learning_rate": 3.735177865612648e-07, + "loss": 0.9204, + "mean_token_accuracy": 0.7260603308677673, + "num_tokens": 90977552.0, + "step": 190 + }, + { + "epoch": 0.11335311572700296, + "grad_norm": 0.9116847515106201, + "learning_rate": 3.7549407114624504e-07, + "loss": 0.9254, + "mean_token_accuracy": 0.7245848774909973, + "num_tokens": 91463031.0, + "step": 191 + }, + { + "epoch": 0.11394658753709198, + "grad_norm": 0.8400865793228149, + "learning_rate": 3.7747035573122525e-07, + "loss": 0.9185, + "mean_token_accuracy": 0.7281263470649719, + "num_tokens": 91917226.0, + "step": 192 + }, + { + "epoch": 0.114540059347181, + "grad_norm": 0.7870621085166931, + "learning_rate": 3.794466403162055e-07, + "loss": 0.8958, + "mean_token_accuracy": 0.7334863543510437, + "num_tokens": 92455357.0, + "step": 193 + }, + { + "epoch": 0.11513353115727003, + "grad_norm": 0.8627499341964722, + "learning_rate": 3.814229249011858e-07, + "loss": 0.9585, + "mean_token_accuracy": 0.7181917428970337, + "num_tokens": 92888882.0, + "step": 194 + }, + { + "epoch": 0.11572700296735905, + "grad_norm": 0.741417646408081, + 
"learning_rate": 3.83399209486166e-07, + "loss": 0.847, + "mean_token_accuracy": 0.7452138662338257, + "num_tokens": 93376011.0, + "step": 195 + }, + { + "epoch": 0.11632047477744807, + "grad_norm": 0.7699609398841858, + "learning_rate": 3.853754940711462e-07, + "loss": 0.885, + "mean_token_accuracy": 0.7346129417419434, + "num_tokens": 93884312.0, + "step": 196 + }, + { + "epoch": 0.1169139465875371, + "grad_norm": 0.8105108141899109, + "learning_rate": 3.8735177865612647e-07, + "loss": 0.8892, + "mean_token_accuracy": 0.7363264560699463, + "num_tokens": 94353048.0, + "step": 197 + }, + { + "epoch": 0.11750741839762611, + "grad_norm": 0.7871448397636414, + "learning_rate": 3.893280632411067e-07, + "loss": 0.9202, + "mean_token_accuracy": 0.727777361869812, + "num_tokens": 94872779.0, + "step": 198 + }, + { + "epoch": 0.11810089020771514, + "grad_norm": 0.702652096748352, + "learning_rate": 3.9130434782608694e-07, + "loss": 0.8999, + "mean_token_accuracy": 0.7326028347015381, + "num_tokens": 95354199.0, + "step": 199 + }, + { + "epoch": 0.11869436201780416, + "grad_norm": 0.723550021648407, + "learning_rate": 3.9328063241106716e-07, + "loss": 0.9232, + "mean_token_accuracy": 0.7263925075531006, + "num_tokens": 95868190.0, + "step": 200 + }, + { + "epoch": 0.11928783382789318, + "grad_norm": 0.7976589202880859, + "learning_rate": 3.952569169960474e-07, + "loss": 0.9315, + "mean_token_accuracy": 0.7257703542709351, + "num_tokens": 96347799.0, + "step": 201 + }, + { + "epoch": 0.1198813056379822, + "grad_norm": 0.7637619972229004, + "learning_rate": 3.9723320158102763e-07, + "loss": 0.9368, + "mean_token_accuracy": 0.7219637036323547, + "num_tokens": 96820811.0, + "step": 202 + }, + { + "epoch": 0.12047477744807121, + "grad_norm": 0.7049606442451477, + "learning_rate": 3.992094861660079e-07, + "loss": 0.9474, + "mean_token_accuracy": 0.7192833423614502, + "num_tokens": 97275892.0, + "step": 203 + }, + { + "epoch": 0.12106824925816023, + "grad_norm": 
0.7201499342918396, + "learning_rate": 4.0118577075098816e-07, + "loss": 0.9442, + "mean_token_accuracy": 0.7195571660995483, + "num_tokens": 97764438.0, + "step": 204 + }, + { + "epoch": 0.12166172106824925, + "grad_norm": 0.7859538197517395, + "learning_rate": 4.031620553359684e-07, + "loss": 0.9223, + "mean_token_accuracy": 0.7248430252075195, + "num_tokens": 98210336.0, + "step": 205 + }, + { + "epoch": 0.12225519287833828, + "grad_norm": 0.7355934381484985, + "learning_rate": 4.051383399209486e-07, + "loss": 0.9167, + "mean_token_accuracy": 0.7276939749717712, + "num_tokens": 98734983.0, + "step": 206 + }, + { + "epoch": 0.1228486646884273, + "grad_norm": 0.7770371437072754, + "learning_rate": 4.071146245059288e-07, + "loss": 0.8873, + "mean_token_accuracy": 0.7350645661354065, + "num_tokens": 99199095.0, + "step": 207 + }, + { + "epoch": 0.12344213649851632, + "grad_norm": 0.7033586502075195, + "learning_rate": 4.090909090909091e-07, + "loss": 0.9061, + "mean_token_accuracy": 0.7301281690597534, + "num_tokens": 99693101.0, + "step": 208 + }, + { + "epoch": 0.12403560830860534, + "grad_norm": 0.719533383846283, + "learning_rate": 4.1106719367588933e-07, + "loss": 0.8598, + "mean_token_accuracy": 0.743219256401062, + "num_tokens": 100147084.0, + "step": 209 + }, + { + "epoch": 0.12462908011869436, + "grad_norm": 0.7147944569587708, + "learning_rate": 4.1304347826086954e-07, + "loss": 0.9015, + "mean_token_accuracy": 0.7311413884162903, + "num_tokens": 100676337.0, + "step": 210 + }, + { + "epoch": 0.12522255192878337, + "grad_norm": 0.7242381572723389, + "learning_rate": 4.1501976284584975e-07, + "loss": 0.8822, + "mean_token_accuracy": 0.7351129055023193, + "num_tokens": 101152663.0, + "step": 211 + }, + { + "epoch": 0.1258160237388724, + "grad_norm": 0.7442696690559387, + "learning_rate": 4.1699604743083e-07, + "loss": 0.9044, + "mean_token_accuracy": 0.7292699217796326, + "num_tokens": 101604278.0, + "step": 212 + }, + { + "epoch": 0.12640949554896141, + 
"grad_norm": 0.7358782887458801, + "learning_rate": 4.189723320158103e-07, + "loss": 0.9484, + "mean_token_accuracy": 0.7198486924171448, + "num_tokens": 102077595.0, + "step": 213 + }, + { + "epoch": 0.12700296735905045, + "grad_norm": 0.6654942631721497, + "learning_rate": 4.209486166007905e-07, + "loss": 0.9063, + "mean_token_accuracy": 0.7291751503944397, + "num_tokens": 102551044.0, + "step": 214 + }, + { + "epoch": 0.12759643916913946, + "grad_norm": 0.7069142460823059, + "learning_rate": 4.2292490118577076e-07, + "loss": 0.8496, + "mean_token_accuracy": 0.7436105012893677, + "num_tokens": 103060022.0, + "step": 215 + }, + { + "epoch": 0.1281899109792285, + "grad_norm": 0.7035040259361267, + "learning_rate": 4.2490118577075097e-07, + "loss": 0.9063, + "mean_token_accuracy": 0.7291296720504761, + "num_tokens": 103546986.0, + "step": 216 + }, + { + "epoch": 0.1287833827893175, + "grad_norm": 0.735276460647583, + "learning_rate": 4.268774703557312e-07, + "loss": 0.8968, + "mean_token_accuracy": 0.730786144733429, + "num_tokens": 103974646.0, + "step": 217 + }, + { + "epoch": 0.12937685459940654, + "grad_norm": 0.6929419040679932, + "learning_rate": 4.2885375494071145e-07, + "loss": 0.9419, + "mean_token_accuracy": 0.7190992832183838, + "num_tokens": 104502639.0, + "step": 218 + }, + { + "epoch": 0.12997032640949555, + "grad_norm": 0.7273895740509033, + "learning_rate": 4.308300395256917e-07, + "loss": 0.9021, + "mean_token_accuracy": 0.7299305200576782, + "num_tokens": 104983252.0, + "step": 219 + }, + { + "epoch": 0.13056379821958458, + "grad_norm": 0.7458178400993347, + "learning_rate": 4.328063241106719e-07, + "loss": 0.9406, + "mean_token_accuracy": 0.7214957475662231, + "num_tokens": 105456082.0, + "step": 220 + }, + { + "epoch": 0.1311572700296736, + "grad_norm": 0.6430400609970093, + "learning_rate": 4.3478260869565214e-07, + "loss": 0.8853, + "mean_token_accuracy": 0.7356036305427551, + "num_tokens": 105936591.0, + "step": 221 + }, + { + "epoch": 
0.13175074183976263, + "grad_norm": 0.5935526490211487, + "learning_rate": 4.3675889328063235e-07, + "loss": 0.8859, + "mean_token_accuracy": 0.7353531122207642, + "num_tokens": 106458395.0, + "step": 222 + }, + { + "epoch": 0.13234421364985163, + "grad_norm": 0.647051990032196, + "learning_rate": 4.3873517786561267e-07, + "loss": 0.9149, + "mean_token_accuracy": 0.7281622290611267, + "num_tokens": 106915989.0, + "step": 223 + }, + { + "epoch": 0.13293768545994064, + "grad_norm": 0.5986255407333374, + "learning_rate": 4.407114624505929e-07, + "loss": 0.9053, + "mean_token_accuracy": 0.7289798259735107, + "num_tokens": 107431619.0, + "step": 224 + }, + { + "epoch": 0.13353115727002968, + "grad_norm": 0.6777673959732056, + "learning_rate": 4.426877470355731e-07, + "loss": 0.8596, + "mean_token_accuracy": 0.743748664855957, + "num_tokens": 107931614.0, + "step": 225 + }, + { + "epoch": 0.13412462908011868, + "grad_norm": 0.6723642945289612, + "learning_rate": 4.446640316205533e-07, + "loss": 0.9093, + "mean_token_accuracy": 0.7297874689102173, + "num_tokens": 108405876.0, + "step": 226 + }, + { + "epoch": 0.13471810089020772, + "grad_norm": 1.3468002080917358, + "learning_rate": 4.4664031620553357e-07, + "loss": 0.8956, + "mean_token_accuracy": 0.7319724559783936, + "num_tokens": 108882187.0, + "step": 227 + }, + { + "epoch": 0.13531157270029673, + "grad_norm": 0.7743226885795593, + "learning_rate": 4.4861660079051383e-07, + "loss": 0.8954, + "mean_token_accuracy": 0.7330036163330078, + "num_tokens": 109342350.0, + "step": 228 + }, + { + "epoch": 0.13590504451038576, + "grad_norm": 0.6385214328765869, + "learning_rate": 4.5059288537549404e-07, + "loss": 0.847, + "mean_token_accuracy": 0.745038628578186, + "num_tokens": 109799439.0, + "step": 229 + }, + { + "epoch": 0.13649851632047477, + "grad_norm": 0.7175956964492798, + "learning_rate": 4.525691699604743e-07, + "loss": 0.9214, + "mean_token_accuracy": 0.7266594171524048, + "num_tokens": 110232844.0, + "step": 230 + 
}, + { + "epoch": 0.1370919881305638, + "grad_norm": 0.7336907982826233, + "learning_rate": 4.545454545454545e-07, + "loss": 0.8818, + "mean_token_accuracy": 0.7356725335121155, + "num_tokens": 110648234.0, + "step": 231 + }, + { + "epoch": 0.13768545994065282, + "grad_norm": 0.8552148342132568, + "learning_rate": 4.5652173913043473e-07, + "loss": 0.8327, + "mean_token_accuracy": 0.7481706142425537, + "num_tokens": 111118601.0, + "step": 232 + }, + { + "epoch": 0.13827893175074185, + "grad_norm": 0.7125935554504395, + "learning_rate": 4.58498023715415e-07, + "loss": 0.8586, + "mean_token_accuracy": 0.7422958016395569, + "num_tokens": 111584487.0, + "step": 233 + }, + { + "epoch": 0.13887240356083086, + "grad_norm": 0.7227843999862671, + "learning_rate": 4.6047430830039526e-07, + "loss": 0.8417, + "mean_token_accuracy": 0.7478727102279663, + "num_tokens": 112080501.0, + "step": 234 + }, + { + "epoch": 0.1394658753709199, + "grad_norm": 0.6569495797157288, + "learning_rate": 4.624505928853755e-07, + "loss": 0.8354, + "mean_token_accuracy": 0.749204158782959, + "num_tokens": 112574826.0, + "step": 235 + }, + { + "epoch": 0.1400593471810089, + "grad_norm": 0.7221552729606628, + "learning_rate": 4.644268774703557e-07, + "loss": 0.8342, + "mean_token_accuracy": 0.7484636306762695, + "num_tokens": 113031569.0, + "step": 236 + }, + { + "epoch": 0.1406528189910979, + "grad_norm": 0.742508053779602, + "learning_rate": 4.6640316205533595e-07, + "loss": 0.8399, + "mean_token_accuracy": 0.7442898750305176, + "num_tokens": 113530601.0, + "step": 237 + }, + { + "epoch": 0.14124629080118695, + "grad_norm": 0.6022343039512634, + "learning_rate": 4.683794466403162e-07, + "loss": 0.9202, + "mean_token_accuracy": 0.7239089012145996, + "num_tokens": 114011145.0, + "step": 238 + }, + { + "epoch": 0.14183976261127595, + "grad_norm": 0.6070155501365662, + "learning_rate": 4.7035573122529643e-07, + "loss": 0.8657, + "mean_token_accuracy": 0.740660548210144, + "num_tokens": 114502308.0, + 
"step": 239 + }, + { + "epoch": 0.142433234421365, + "grad_norm": 0.6321383118629456, + "learning_rate": 4.7233201581027664e-07, + "loss": 0.9073, + "mean_token_accuracy": 0.7290204763412476, + "num_tokens": 114990095.0, + "step": 240 + }, + { + "epoch": 0.143026706231454, + "grad_norm": 0.6023983955383301, + "learning_rate": 4.7430830039525685e-07, + "loss": 0.907, + "mean_token_accuracy": 0.7266771197319031, + "num_tokens": 115464729.0, + "step": 241 + }, + { + "epoch": 0.14362017804154303, + "grad_norm": 0.6307567358016968, + "learning_rate": 4.7628458498023717e-07, + "loss": 0.9094, + "mean_token_accuracy": 0.7302615642547607, + "num_tokens": 115937102.0, + "step": 242 + }, + { + "epoch": 0.14421364985163204, + "grad_norm": 0.6350536942481995, + "learning_rate": 4.782608695652174e-07, + "loss": 0.8668, + "mean_token_accuracy": 0.7399429082870483, + "num_tokens": 116382659.0, + "step": 243 + }, + { + "epoch": 0.14480712166172108, + "grad_norm": 0.6348956823348999, + "learning_rate": 4.802371541501976e-07, + "loss": 0.907, + "mean_token_accuracy": 0.7282727360725403, + "num_tokens": 116841696.0, + "step": 244 + }, + { + "epoch": 0.14540059347181009, + "grad_norm": 0.6034623384475708, + "learning_rate": 4.822134387351779e-07, + "loss": 0.8746, + "mean_token_accuracy": 0.7382657527923584, + "num_tokens": 117346687.0, + "step": 245 + }, + { + "epoch": 0.14599406528189912, + "grad_norm": 0.5686222314834595, + "learning_rate": 4.841897233201581e-07, + "loss": 0.8445, + "mean_token_accuracy": 0.7451201677322388, + "num_tokens": 117857877.0, + "step": 246 + }, + { + "epoch": 0.14658753709198813, + "grad_norm": 0.617284893989563, + "learning_rate": 4.861660079051384e-07, + "loss": 0.8572, + "mean_token_accuracy": 0.7416367530822754, + "num_tokens": 118341597.0, + "step": 247 + }, + { + "epoch": 0.14718100890207717, + "grad_norm": 0.5639653205871582, + "learning_rate": 4.881422924901186e-07, + "loss": 0.9251, + "mean_token_accuracy": 0.7216596007347107, + "num_tokens": 
118856681.0, + "step": 248 + }, + { + "epoch": 0.14777448071216617, + "grad_norm": 0.5582653284072876, + "learning_rate": 4.901185770750988e-07, + "loss": 0.8964, + "mean_token_accuracy": 0.7314809560775757, + "num_tokens": 119425179.0, + "step": 249 + }, + { + "epoch": 0.14836795252225518, + "grad_norm": 0.6271770000457764, + "learning_rate": 4.92094861660079e-07, + "loss": 0.8594, + "mean_token_accuracy": 0.7409073114395142, + "num_tokens": 119882990.0, + "step": 250 + }, + { + "epoch": 0.14896142433234422, + "grad_norm": 0.5710375905036926, + "learning_rate": 4.940711462450592e-07, + "loss": 0.8868, + "mean_token_accuracy": 0.7345120310783386, + "num_tokens": 120366353.0, + "step": 251 + }, + { + "epoch": 0.14955489614243322, + "grad_norm": 0.5671748518943787, + "learning_rate": 4.960474308300396e-07, + "loss": 0.8608, + "mean_token_accuracy": 0.7416208982467651, + "num_tokens": 120862977.0, + "step": 252 + }, + { + "epoch": 0.15014836795252226, + "grad_norm": 0.6262017488479614, + "learning_rate": 4.980237154150198e-07, + "loss": 0.9072, + "mean_token_accuracy": 0.7267822027206421, + "num_tokens": 121344351.0, + "step": 253 + }, + { + "epoch": 0.15074183976261127, + "grad_norm": 0.5783612728118896, + "learning_rate": 5e-07, + "loss": 0.8502, + "mean_token_accuracy": 0.7435365319252014, + "num_tokens": 121857178.0, + "step": 254 + }, + { + "epoch": 0.1513353115727003, + "grad_norm": 0.5740035176277161, + "learning_rate": 5.019762845849802e-07, + "loss": 0.8563, + "mean_token_accuracy": 0.7418029308319092, + "num_tokens": 122385411.0, + "step": 255 + }, + { + "epoch": 0.1519287833827893, + "grad_norm": 0.5782800316810608, + "learning_rate": 5.039525691699604e-07, + "loss": 0.8456, + "mean_token_accuracy": 0.7448956966400146, + "num_tokens": 122870924.0, + "step": 256 + }, + { + "epoch": 0.15252225519287835, + "grad_norm": 0.6165631413459778, + "learning_rate": 5.059288537549406e-07, + "loss": 0.9043, + "mean_token_accuracy": 0.7298426628112793, + "num_tokens": 
123337227.0, + "step": 257 + }, + { + "epoch": 0.15311572700296736, + "grad_norm": 0.6141627430915833, + "learning_rate": 5.079051383399209e-07, + "loss": 0.8727, + "mean_token_accuracy": 0.7363383769989014, + "num_tokens": 123827346.0, + "step": 258 + }, + { + "epoch": 0.1537091988130564, + "grad_norm": 0.6024271845817566, + "learning_rate": 5.098814229249012e-07, + "loss": 0.94, + "mean_token_accuracy": 0.7202662229537964, + "num_tokens": 124311063.0, + "step": 259 + }, + { + "epoch": 0.1543026706231454, + "grad_norm": 0.5800171494483948, + "learning_rate": 5.118577075098815e-07, + "loss": 0.9224, + "mean_token_accuracy": 0.7258583307266235, + "num_tokens": 124824910.0, + "step": 260 + }, + { + "epoch": 0.15489614243323443, + "grad_norm": 0.5814382433891296, + "learning_rate": 5.138339920948617e-07, + "loss": 0.8508, + "mean_token_accuracy": 0.7431135177612305, + "num_tokens": 125324107.0, + "step": 261 + }, + { + "epoch": 0.15548961424332344, + "grad_norm": 0.5804517269134521, + "learning_rate": 5.158102766798419e-07, + "loss": 0.8511, + "mean_token_accuracy": 0.7414776086807251, + "num_tokens": 125817886.0, + "step": 262 + }, + { + "epoch": 0.15608308605341245, + "grad_norm": 0.5596232414245605, + "learning_rate": 5.177865612648221e-07, + "loss": 0.8571, + "mean_token_accuracy": 0.7426705360412598, + "num_tokens": 126313647.0, + "step": 263 + }, + { + "epoch": 0.1566765578635015, + "grad_norm": 0.5509746670722961, + "learning_rate": 5.197628458498023e-07, + "loss": 0.8344, + "mean_token_accuracy": 0.7473194003105164, + "num_tokens": 126806452.0, + "step": 264 + }, + { + "epoch": 0.1572700296735905, + "grad_norm": 0.6303498148918152, + "learning_rate": 5.217391304347825e-07, + "loss": 0.8493, + "mean_token_accuracy": 0.7406132817268372, + "num_tokens": 127258519.0, + "step": 265 + }, + { + "epoch": 0.15786350148367953, + "grad_norm": 0.5841927528381348, + "learning_rate": 5.237154150197628e-07, + "loss": 0.848, + "mean_token_accuracy": 0.7447119355201721, + 
"num_tokens": 127754745.0, + "step": 266 + }, + { + "epoch": 0.15845697329376854, + "grad_norm": 0.5861538648605347, + "learning_rate": 5.25691699604743e-07, + "loss": 0.8713, + "mean_token_accuracy": 0.735825777053833, + "num_tokens": 128227425.0, + "step": 267 + }, + { + "epoch": 0.15905044510385757, + "grad_norm": 0.5899208188056946, + "learning_rate": 5.276679841897233e-07, + "loss": 0.8455, + "mean_token_accuracy": 0.7450999617576599, + "num_tokens": 128663230.0, + "step": 268 + }, + { + "epoch": 0.15964391691394658, + "grad_norm": 0.5580462217330933, + "learning_rate": 5.296442687747036e-07, + "loss": 0.7843, + "mean_token_accuracy": 0.758391261100769, + "num_tokens": 129136834.0, + "step": 269 + }, + { + "epoch": 0.16023738872403562, + "grad_norm": 0.63301020860672, + "learning_rate": 5.316205533596838e-07, + "loss": 0.8825, + "mean_token_accuracy": 0.7338143587112427, + "num_tokens": 129554018.0, + "step": 270 + }, + { + "epoch": 0.16083086053412463, + "grad_norm": 0.6174740791320801, + "learning_rate": 5.33596837944664e-07, + "loss": 0.8834, + "mean_token_accuracy": 0.7333071231842041, + "num_tokens": 129968279.0, + "step": 271 + }, + { + "epoch": 0.16142433234421366, + "grad_norm": 0.579189121723175, + "learning_rate": 5.355731225296442e-07, + "loss": 0.9209, + "mean_token_accuracy": 0.7245802879333496, + "num_tokens": 130443762.0, + "step": 272 + }, + { + "epoch": 0.16201780415430267, + "grad_norm": 0.5629114508628845, + "learning_rate": 5.375494071146245e-07, + "loss": 0.8957, + "mean_token_accuracy": 0.7319962978363037, + "num_tokens": 130931556.0, + "step": 273 + }, + { + "epoch": 0.1626112759643917, + "grad_norm": 0.5565395951271057, + "learning_rate": 5.395256916996047e-07, + "loss": 0.8499, + "mean_token_accuracy": 0.7433028817176819, + "num_tokens": 131458822.0, + "step": 274 + }, + { + "epoch": 0.1632047477744807, + "grad_norm": 0.5914154648780823, + "learning_rate": 5.41501976284585e-07, + "loss": 0.8516, + "mean_token_accuracy": 
0.7414354681968689, + "num_tokens": 131915843.0, + "step": 275 + }, + { + "epoch": 0.16379821958456972, + "grad_norm": 0.5892292857170105, + "learning_rate": 5.434782608695652e-07, + "loss": 0.853, + "mean_token_accuracy": 0.7425951957702637, + "num_tokens": 132365463.0, + "step": 276 + }, + { + "epoch": 0.16439169139465876, + "grad_norm": 0.5671647191047668, + "learning_rate": 5.454545454545454e-07, + "loss": 0.9099, + "mean_token_accuracy": 0.7259072065353394, + "num_tokens": 132844686.0, + "step": 277 + }, + { + "epoch": 0.16498516320474776, + "grad_norm": 0.6708762049674988, + "learning_rate": 5.474308300395256e-07, + "loss": 0.8685, + "mean_token_accuracy": 0.7351996302604675, + "num_tokens": 133268317.0, + "step": 278 + }, + { + "epoch": 0.1655786350148368, + "grad_norm": 0.5564829707145691, + "learning_rate": 5.494071146245059e-07, + "loss": 0.8535, + "mean_token_accuracy": 0.743177056312561, + "num_tokens": 133756143.0, + "step": 279 + }, + { + "epoch": 0.1661721068249258, + "grad_norm": 0.5889559388160706, + "learning_rate": 5.513833992094862e-07, + "loss": 0.8156, + "mean_token_accuracy": 0.7534520030021667, + "num_tokens": 134227590.0, + "step": 280 + }, + { + "epoch": 0.16676557863501484, + "grad_norm": 0.6062011122703552, + "learning_rate": 5.533596837944664e-07, + "loss": 0.8514, + "mean_token_accuracy": 0.742458701133728, + "num_tokens": 134688388.0, + "step": 281 + }, + { + "epoch": 0.16735905044510385, + "grad_norm": 0.5948068499565125, + "learning_rate": 5.553359683794467e-07, + "loss": 0.9799, + "mean_token_accuracy": 0.7098896503448486, + "num_tokens": 135189594.0, + "step": 282 + }, + { + "epoch": 0.1679525222551929, + "grad_norm": 0.5917317867279053, + "learning_rate": 5.573122529644269e-07, + "loss": 0.9086, + "mean_token_accuracy": 0.7285333871841431, + "num_tokens": 135668985.0, + "step": 283 + }, + { + "epoch": 0.1685459940652819, + "grad_norm": 0.6109099388122559, + "learning_rate": 5.592885375494071e-07, + "loss": 0.7901, + 
"mean_token_accuracy": 0.7566694021224976, + "num_tokens": 136165179.0, + "step": 284 + }, + { + "epoch": 0.16913946587537093, + "grad_norm": 0.6253790259361267, + "learning_rate": 5.612648221343873e-07, + "loss": 0.8479, + "mean_token_accuracy": 0.7422863841056824, + "num_tokens": 136584212.0, + "step": 285 + }, + { + "epoch": 0.16973293768545994, + "grad_norm": 0.5906147956848145, + "learning_rate": 5.632411067193675e-07, + "loss": 0.9004, + "mean_token_accuracy": 0.7299912571907043, + "num_tokens": 137075202.0, + "step": 286 + }, + { + "epoch": 0.17032640949554897, + "grad_norm": 0.5868690609931946, + "learning_rate": 5.652173913043477e-07, + "loss": 0.9177, + "mean_token_accuracy": 0.7267017960548401, + "num_tokens": 137565066.0, + "step": 287 + }, + { + "epoch": 0.17091988130563798, + "grad_norm": 0.572472870349884, + "learning_rate": 5.67193675889328e-07, + "loss": 0.8503, + "mean_token_accuracy": 0.7433338165283203, + "num_tokens": 138057296.0, + "step": 288 + }, + { + "epoch": 0.171513353115727, + "grad_norm": 0.5685717463493347, + "learning_rate": 5.691699604743083e-07, + "loss": 0.8826, + "mean_token_accuracy": 0.7342575788497925, + "num_tokens": 138558375.0, + "step": 289 + }, + { + "epoch": 0.17210682492581603, + "grad_norm": 0.5564897656440735, + "learning_rate": 5.711462450592886e-07, + "loss": 0.8429, + "mean_token_accuracy": 0.7437861561775208, + "num_tokens": 139057371.0, + "step": 290 + }, + { + "epoch": 0.17270029673590503, + "grad_norm": 0.5901769995689392, + "learning_rate": 5.731225296442688e-07, + "loss": 0.8356, + "mean_token_accuracy": 0.7454248666763306, + "num_tokens": 139498119.0, + "step": 291 + }, + { + "epoch": 0.17329376854599407, + "grad_norm": 0.584563672542572, + "learning_rate": 5.75098814229249e-07, + "loss": 0.919, + "mean_token_accuracy": 0.7253344058990479, + "num_tokens": 139963713.0, + "step": 292 + }, + { + "epoch": 0.17388724035608308, + "grad_norm": 0.5806412100791931, + "learning_rate": 5.770750988142292e-07, + "loss": 
0.8599, + "mean_token_accuracy": 0.7385250329971313, + "num_tokens": 140429338.0, + "step": 293 + }, + { + "epoch": 0.1744807121661721, + "grad_norm": 0.5669295191764832, + "learning_rate": 5.790513833992094e-07, + "loss": 0.8518, + "mean_token_accuracy": 0.7406917810440063, + "num_tokens": 140890399.0, + "step": 294 + }, + { + "epoch": 0.17507418397626112, + "grad_norm": 0.5856143236160278, + "learning_rate": 5.810276679841896e-07, + "loss": 0.9243, + "mean_token_accuracy": 0.7226451635360718, + "num_tokens": 141381065.0, + "step": 295 + }, + { + "epoch": 0.17566765578635016, + "grad_norm": 0.5576003789901733, + "learning_rate": 5.830039525691699e-07, + "loss": 0.8993, + "mean_token_accuracy": 0.7292560935020447, + "num_tokens": 141877468.0, + "step": 296 + }, + { + "epoch": 0.17626112759643917, + "grad_norm": 0.5755586624145508, + "learning_rate": 5.849802371541502e-07, + "loss": 0.8584, + "mean_token_accuracy": 0.7400997877120972, + "num_tokens": 142364256.0, + "step": 297 + }, + { + "epoch": 0.1768545994065282, + "grad_norm": 0.5431115627288818, + "learning_rate": 5.869565217391305e-07, + "loss": 0.8621, + "mean_token_accuracy": 0.7392741441726685, + "num_tokens": 142844699.0, + "step": 298 + }, + { + "epoch": 0.1774480712166172, + "grad_norm": 0.5669428706169128, + "learning_rate": 5.889328063241107e-07, + "loss": 0.8224, + "mean_token_accuracy": 0.7498292922973633, + "num_tokens": 143328179.0, + "step": 299 + }, + { + "epoch": 0.17804154302670624, + "grad_norm": 0.5737453103065491, + "learning_rate": 5.909090909090909e-07, + "loss": 0.898, + "mean_token_accuracy": 0.731471598148346, + "num_tokens": 143807512.0, + "step": 300 + }, + { + "epoch": 0.17863501483679525, + "grad_norm": 0.592812716960907, + "learning_rate": 5.928853754940711e-07, + "loss": 0.8241, + "mean_token_accuracy": 0.748705267906189, + "num_tokens": 144279194.0, + "step": 301 + }, + { + "epoch": 0.17922848664688426, + "grad_norm": 0.5894924402236938, + "learning_rate": 5.948616600790513e-07, 
+ "loss": 0.8199, + "mean_token_accuracy": 0.748826265335083, + "num_tokens": 144757224.0, + "step": 302 + }, + { + "epoch": 0.1798219584569733, + "grad_norm": 0.5428017973899841, + "learning_rate": 5.968379446640316e-07, + "loss": 0.8595, + "mean_token_accuracy": 0.7409827709197998, + "num_tokens": 145304365.0, + "step": 303 + }, + { + "epoch": 0.1804154302670623, + "grad_norm": 0.5847616791725159, + "learning_rate": 5.988142292490118e-07, + "loss": 0.8618, + "mean_token_accuracy": 0.7377108335494995, + "num_tokens": 145808734.0, + "step": 304 + }, + { + "epoch": 0.18100890207715134, + "grad_norm": 0.5626961588859558, + "learning_rate": 6.007905138339921e-07, + "loss": 0.8024, + "mean_token_accuracy": 0.7567036151885986, + "num_tokens": 146308566.0, + "step": 305 + }, + { + "epoch": 0.18160237388724035, + "grad_norm": 0.5920644402503967, + "learning_rate": 6.027667984189723e-07, + "loss": 0.9013, + "mean_token_accuracy": 0.7282958030700684, + "num_tokens": 146781515.0, + "step": 306 + }, + { + "epoch": 0.18219584569732938, + "grad_norm": 0.6019842624664307, + "learning_rate": 6.047430830039525e-07, + "loss": 0.9034, + "mean_token_accuracy": 0.7287179231643677, + "num_tokens": 147222618.0, + "step": 307 + }, + { + "epoch": 0.1827893175074184, + "grad_norm": 0.5624140501022339, + "learning_rate": 6.067193675889328e-07, + "loss": 0.8347, + "mean_token_accuracy": 0.7456941604614258, + "num_tokens": 147671826.0, + "step": 308 + }, + { + "epoch": 0.18338278931750743, + "grad_norm": 0.5645046830177307, + "learning_rate": 6.08695652173913e-07, + "loss": 0.8004, + "mean_token_accuracy": 0.7548108100891113, + "num_tokens": 148201078.0, + "step": 309 + }, + { + "epoch": 0.18397626112759644, + "grad_norm": 0.6150333881378174, + "learning_rate": 6.106719367588933e-07, + "loss": 0.8514, + "mean_token_accuracy": 0.7408661842346191, + "num_tokens": 148662973.0, + "step": 310 + }, + { + "epoch": 0.18456973293768547, + "grad_norm": 0.552893340587616, + "learning_rate": 
6.126482213438735e-07, + "loss": 0.8527, + "mean_token_accuracy": 0.7420464754104614, + "num_tokens": 149155843.0, + "step": 311 + }, + { + "epoch": 0.18516320474777448, + "grad_norm": 0.6047899723052979, + "learning_rate": 6.146245059288538e-07, + "loss": 0.8469, + "mean_token_accuracy": 0.7429858446121216, + "num_tokens": 149638089.0, + "step": 312 + }, + { + "epoch": 0.18575667655786351, + "grad_norm": 0.6182069182395935, + "learning_rate": 6.16600790513834e-07, + "loss": 0.7892, + "mean_token_accuracy": 0.7590185403823853, + "num_tokens": 150094321.0, + "step": 313 + }, + { + "epoch": 0.18635014836795252, + "grad_norm": 0.5828617215156555, + "learning_rate": 6.185770750988142e-07, + "loss": 0.8378, + "mean_token_accuracy": 0.7464290857315063, + "num_tokens": 150541731.0, + "step": 314 + }, + { + "epoch": 0.18694362017804153, + "grad_norm": 0.611914336681366, + "learning_rate": 6.205533596837944e-07, + "loss": 0.8862, + "mean_token_accuracy": 0.7356253266334534, + "num_tokens": 151001961.0, + "step": 315 + }, + { + "epoch": 0.18753709198813057, + "grad_norm": 0.5799919962882996, + "learning_rate": 6.225296442687746e-07, + "loss": 0.8076, + "mean_token_accuracy": 0.7521226406097412, + "num_tokens": 151506463.0, + "step": 316 + }, + { + "epoch": 0.18813056379821957, + "grad_norm": 0.5535284280776978, + "learning_rate": 6.245059288537548e-07, + "loss": 0.8704, + "mean_token_accuracy": 0.7361530065536499, + "num_tokens": 151993847.0, + "step": 317 + }, + { + "epoch": 0.1887240356083086, + "grad_norm": 0.5738762617111206, + "learning_rate": 6.264822134387352e-07, + "loss": 0.8218, + "mean_token_accuracy": 0.7480014562606812, + "num_tokens": 152508380.0, + "step": 318 + }, + { + "epoch": 0.18931750741839762, + "grad_norm": 0.5919838547706604, + "learning_rate": 6.284584980237154e-07, + "loss": 0.8741, + "mean_token_accuracy": 0.7348266839981079, + "num_tokens": 152980841.0, + "step": 319 + }, + { + "epoch": 0.18991097922848665, + "grad_norm": 0.5330930948257446, + 
"learning_rate": 6.304347826086957e-07, + "loss": 0.8157, + "mean_token_accuracy": 0.7487016916275024, + "num_tokens": 153489165.0, + "step": 320 + }, + { + "epoch": 0.19050445103857566, + "grad_norm": 0.6255847215652466, + "learning_rate": 6.324110671936759e-07, + "loss": 0.8529, + "mean_token_accuracy": 0.7410747408866882, + "num_tokens": 153943967.0, + "step": 321 + }, + { + "epoch": 0.1910979228486647, + "grad_norm": 0.5761905312538147, + "learning_rate": 6.343873517786561e-07, + "loss": 0.8747, + "mean_token_accuracy": 0.7355167865753174, + "num_tokens": 154437164.0, + "step": 322 + }, + { + "epoch": 0.1916913946587537, + "grad_norm": 0.5669413805007935, + "learning_rate": 6.363636363636363e-07, + "loss": 0.8187, + "mean_token_accuracy": 0.7504069209098816, + "num_tokens": 154909811.0, + "step": 323 + }, + { + "epoch": 0.19228486646884274, + "grad_norm": 0.5674612522125244, + "learning_rate": 6.383399209486165e-07, + "loss": 0.9015, + "mean_token_accuracy": 0.729096531867981, + "num_tokens": 155407943.0, + "step": 324 + }, + { + "epoch": 0.19287833827893175, + "grad_norm": 0.5600221157073975, + "learning_rate": 6.403162055335968e-07, + "loss": 0.8502, + "mean_token_accuracy": 0.7403324842453003, + "num_tokens": 155873584.0, + "step": 325 + }, + { + "epoch": 0.19347181008902078, + "grad_norm": 0.5857919454574585, + "learning_rate": 6.42292490118577e-07, + "loss": 0.8992, + "mean_token_accuracy": 0.7309184670448303, + "num_tokens": 156347235.0, + "step": 326 + }, + { + "epoch": 0.1940652818991098, + "grad_norm": 0.6223275065422058, + "learning_rate": 6.442687747035574e-07, + "loss": 0.8962, + "mean_token_accuracy": 0.7292686104774475, + "num_tokens": 156823014.0, + "step": 327 + }, + { + "epoch": 0.1946587537091988, + "grad_norm": 0.5925013422966003, + "learning_rate": 6.462450592885376e-07, + "loss": 0.8906, + "mean_token_accuracy": 0.7300329804420471, + "num_tokens": 157270851.0, + "step": 328 + }, + { + "epoch": 0.19525222551928784, + "grad_norm": 
0.5605390667915344, + "learning_rate": 6.482213438735178e-07, + "loss": 0.8332, + "mean_token_accuracy": 0.744936466217041, + "num_tokens": 157753532.0, + "step": 329 + }, + { + "epoch": 0.19584569732937684, + "grad_norm": 0.6111634969711304, + "learning_rate": 6.50197628458498e-07, + "loss": 0.8537, + "mean_token_accuracy": 0.7411743402481079, + "num_tokens": 158236880.0, + "step": 330 + }, + { + "epoch": 0.19643916913946588, + "grad_norm": 0.5573429465293884, + "learning_rate": 6.521739130434782e-07, + "loss": 0.7595, + "mean_token_accuracy": 0.7662185430526733, + "num_tokens": 158768034.0, + "step": 331 + }, + { + "epoch": 0.1970326409495549, + "grad_norm": 0.5996273159980774, + "learning_rate": 6.541501976284584e-07, + "loss": 0.8576, + "mean_token_accuracy": 0.7403936386108398, + "num_tokens": 159225062.0, + "step": 332 + }, + { + "epoch": 0.19762611275964392, + "grad_norm": 0.6296471953392029, + "learning_rate": 6.561264822134387e-07, + "loss": 0.8842, + "mean_token_accuracy": 0.7334229946136475, + "num_tokens": 159693525.0, + "step": 333 + }, + { + "epoch": 0.19821958456973293, + "grad_norm": 0.5545525550842285, + "learning_rate": 6.581027667984189e-07, + "loss": 0.8756, + "mean_token_accuracy": 0.7345190644264221, + "num_tokens": 160177242.0, + "step": 334 + }, + { + "epoch": 0.19881305637982197, + "grad_norm": 0.5344577431678772, + "learning_rate": 6.600790513833992e-07, + "loss": 0.8499, + "mean_token_accuracy": 0.7437765598297119, + "num_tokens": 160697022.0, + "step": 335 + }, + { + "epoch": 0.19940652818991098, + "grad_norm": 0.5460211038589478, + "learning_rate": 6.620553359683794e-07, + "loss": 0.8799, + "mean_token_accuracy": 0.7332820892333984, + "num_tokens": 161222970.0, + "step": 336 + }, + { + "epoch": 0.2, + "grad_norm": 0.5988255143165588, + "learning_rate": 6.640316205533597e-07, + "loss": 0.9114, + "mean_token_accuracy": 0.7242366075515747, + "num_tokens": 161675286.0, + "step": 337 + }, + { + "epoch": 0.20059347181008902, + "grad_norm": 
0.5549377202987671, + "learning_rate": 6.660079051383399e-07, + "loss": 0.864, + "mean_token_accuracy": 0.7403877377510071, + "num_tokens": 162168654.0, + "step": 338 + }, + { + "epoch": 0.20118694362017805, + "grad_norm": 0.5627023577690125, + "learning_rate": 6.679841897233201e-07, + "loss": 0.8468, + "mean_token_accuracy": 0.7442084550857544, + "num_tokens": 162665316.0, + "step": 339 + }, + { + "epoch": 0.20178041543026706, + "grad_norm": 0.5598716139793396, + "learning_rate": 6.699604743083004e-07, + "loss": 0.8259, + "mean_token_accuracy": 0.747336745262146, + "num_tokens": 163174863.0, + "step": 340 + }, + { + "epoch": 0.20237388724035607, + "grad_norm": 0.5724729895591736, + "learning_rate": 6.719367588932806e-07, + "loss": 0.8558, + "mean_token_accuracy": 0.7393544316291809, + "num_tokens": 163673709.0, + "step": 341 + }, + { + "epoch": 0.2029673590504451, + "grad_norm": 0.5915362238883972, + "learning_rate": 6.739130434782609e-07, + "loss": 0.8831, + "mean_token_accuracy": 0.7312166094779968, + "num_tokens": 164153228.0, + "step": 342 + }, + { + "epoch": 0.20356083086053411, + "grad_norm": 0.5656456351280212, + "learning_rate": 6.758893280632411e-07, + "loss": 0.8486, + "mean_token_accuracy": 0.7433688640594482, + "num_tokens": 164637797.0, + "step": 343 + }, + { + "epoch": 0.20415430267062315, + "grad_norm": 0.5561266541481018, + "learning_rate": 6.778656126482213e-07, + "loss": 0.8138, + "mean_token_accuracy": 0.7502662539482117, + "num_tokens": 165118670.0, + "step": 344 + }, + { + "epoch": 0.20474777448071216, + "grad_norm": 0.6087284088134766, + "learning_rate": 6.798418972332015e-07, + "loss": 0.8762, + "mean_token_accuracy": 0.7341748476028442, + "num_tokens": 165595897.0, + "step": 345 + }, + { + "epoch": 0.2053412462908012, + "grad_norm": 0.5591445565223694, + "learning_rate": 6.818181818181817e-07, + "loss": 0.817, + "mean_token_accuracy": 0.7515691518783569, + "num_tokens": 166080722.0, + "step": 346 + }, + { + "epoch": 0.2059347181008902, + 
"grad_norm": 0.5930289626121521, + "learning_rate": 6.837944664031621e-07, + "loss": 0.806, + "mean_token_accuracy": 0.7516210675239563, + "num_tokens": 166551725.0, + "step": 347 + }, + { + "epoch": 0.20652818991097924, + "grad_norm": 0.5761884450912476, + "learning_rate": 6.857707509881423e-07, + "loss": 0.8372, + "mean_token_accuracy": 0.7439178228378296, + "num_tokens": 166992068.0, + "step": 348 + }, + { + "epoch": 0.20712166172106825, + "grad_norm": 0.5876266360282898, + "learning_rate": 6.877470355731225e-07, + "loss": 0.8554, + "mean_token_accuracy": 0.7406681776046753, + "num_tokens": 167517849.0, + "step": 349 + }, + { + "epoch": 0.20771513353115728, + "grad_norm": 0.5640336871147156, + "learning_rate": 6.897233201581028e-07, + "loss": 0.8328, + "mean_token_accuracy": 0.7456263303756714, + "num_tokens": 167972111.0, + "step": 350 + }, + { + "epoch": 0.2083086053412463, + "grad_norm": 0.577665388584137, + "learning_rate": 6.91699604743083e-07, + "loss": 0.8199, + "mean_token_accuracy": 0.7503520250320435, + "num_tokens": 168470998.0, + "step": 351 + }, + { + "epoch": 0.20890207715133532, + "grad_norm": 0.5662756562232971, + "learning_rate": 6.936758893280632e-07, + "loss": 0.8339, + "mean_token_accuracy": 0.7450497150421143, + "num_tokens": 168984401.0, + "step": 352 + }, + { + "epoch": 0.20949554896142433, + "grad_norm": 0.5389705300331116, + "learning_rate": 6.956521739130434e-07, + "loss": 0.8155, + "mean_token_accuracy": 0.7502873539924622, + "num_tokens": 169479298.0, + "step": 353 + }, + { + "epoch": 0.21008902077151334, + "grad_norm": 0.5754348635673523, + "learning_rate": 6.976284584980236e-07, + "loss": 0.8055, + "mean_token_accuracy": 0.7505828142166138, + "num_tokens": 169938895.0, + "step": 354 + }, + { + "epoch": 0.21068249258160238, + "grad_norm": 0.5941538214683533, + "learning_rate": 6.996047430830039e-07, + "loss": 0.877, + "mean_token_accuracy": 0.7340965270996094, + "num_tokens": 170389719.0, + "step": 355 + }, + { + "epoch": 
0.21127596439169138, + "grad_norm": 0.551005482673645, + "learning_rate": 7.015810276679841e-07, + "loss": 0.7617, + "mean_token_accuracy": 0.7647874355316162, + "num_tokens": 170872711.0, + "step": 356 + }, + { + "epoch": 0.21186943620178042, + "grad_norm": 0.5665827989578247, + "learning_rate": 7.035573122529645e-07, + "loss": 0.8508, + "mean_token_accuracy": 0.7410812377929688, + "num_tokens": 171329623.0, + "step": 357 + }, + { + "epoch": 0.21246290801186943, + "grad_norm": 0.5490310192108154, + "learning_rate": 7.055335968379447e-07, + "loss": 0.7943, + "mean_token_accuracy": 0.7558802366256714, + "num_tokens": 171835091.0, + "step": 358 + }, + { + "epoch": 0.21305637982195846, + "grad_norm": 0.5378292798995972, + "learning_rate": 7.075098814229249e-07, + "loss": 0.789, + "mean_token_accuracy": 0.757323145866394, + "num_tokens": 172332008.0, + "step": 359 + }, + { + "epoch": 0.21364985163204747, + "grad_norm": 0.5569384694099426, + "learning_rate": 7.094861660079051e-07, + "loss": 0.8444, + "mean_token_accuracy": 0.744402289390564, + "num_tokens": 172822978.0, + "step": 360 + }, + { + "epoch": 0.2142433234421365, + "grad_norm": 0.5686014294624329, + "learning_rate": 7.114624505928853e-07, + "loss": 0.8172, + "mean_token_accuracy": 0.7492443919181824, + "num_tokens": 173285271.0, + "step": 361 + }, + { + "epoch": 0.21483679525222552, + "grad_norm": 0.5913887619972229, + "learning_rate": 7.134387351778656e-07, + "loss": 0.8506, + "mean_token_accuracy": 0.740074872970581, + "num_tokens": 173767183.0, + "step": 362 + }, + { + "epoch": 0.21543026706231455, + "grad_norm": 0.5815613865852356, + "learning_rate": 7.154150197628458e-07, + "loss": 0.8454, + "mean_token_accuracy": 0.7440515756607056, + "num_tokens": 174223573.0, + "step": 363 + }, + { + "epoch": 0.21602373887240356, + "grad_norm": 0.5671404600143433, + "learning_rate": 7.17391304347826e-07, + "loss": 0.8505, + "mean_token_accuracy": 0.7400871515274048, + "num_tokens": 174720443.0, + "step": 364 + }, + { + 
"epoch": 0.2166172106824926, + "grad_norm": 0.5504804253578186, + "learning_rate": 7.193675889328063e-07, + "loss": 0.8614, + "mean_token_accuracy": 0.7389400601387024, + "num_tokens": 175234643.0, + "step": 365 + }, + { + "epoch": 0.2172106824925816, + "grad_norm": 0.569360077381134, + "learning_rate": 7.213438735177866e-07, + "loss": 0.8353, + "mean_token_accuracy": 0.7429498434066772, + "num_tokens": 175693895.0, + "step": 366 + }, + { + "epoch": 0.2178041543026706, + "grad_norm": 0.606054425239563, + "learning_rate": 7.233201581027668e-07, + "loss": 0.8578, + "mean_token_accuracy": 0.7409372329711914, + "num_tokens": 176150505.0, + "step": 367 + }, + { + "epoch": 0.21839762611275965, + "grad_norm": 0.6412845253944397, + "learning_rate": 7.25296442687747e-07, + "loss": 0.8582, + "mean_token_accuracy": 0.7393713593482971, + "num_tokens": 176603953.0, + "step": 368 + }, + { + "epoch": 0.21899109792284865, + "grad_norm": 0.5556405782699585, + "learning_rate": 7.272727272727272e-07, + "loss": 0.8657, + "mean_token_accuracy": 0.7378034591674805, + "num_tokens": 177087128.0, + "step": 369 + }, + { + "epoch": 0.2195845697329377, + "grad_norm": 0.5570692420005798, + "learning_rate": 7.292490118577075e-07, + "loss": 0.8299, + "mean_token_accuracy": 0.7470927238464355, + "num_tokens": 177624953.0, + "step": 370 + }, + { + "epoch": 0.2201780415430267, + "grad_norm": 0.6039096117019653, + "learning_rate": 7.312252964426877e-07, + "loss": 0.7872, + "mean_token_accuracy": 0.757017970085144, + "num_tokens": 178039882.0, + "step": 371 + }, + { + "epoch": 0.22077151335311573, + "grad_norm": 0.5874569416046143, + "learning_rate": 7.33201581027668e-07, + "loss": 0.9135, + "mean_token_accuracy": 0.7259349822998047, + "num_tokens": 178482035.0, + "step": 372 + }, + { + "epoch": 0.22136498516320474, + "grad_norm": 0.5593417882919312, + "learning_rate": 7.351778656126482e-07, + "loss": 0.8193, + "mean_token_accuracy": 0.7493418455123901, + "num_tokens": 178975220.0, + "step": 373 + }, 
+ { + "epoch": 0.22195845697329378, + "grad_norm": 0.6175857186317444, + "learning_rate": 7.371541501976284e-07, + "loss": 0.8236, + "mean_token_accuracy": 0.7478668689727783, + "num_tokens": 179403098.0, + "step": 374 + }, + { + "epoch": 0.22255192878338279, + "grad_norm": 0.5695732831954956, + "learning_rate": 7.391304347826086e-07, + "loss": 0.8542, + "mean_token_accuracy": 0.7405696511268616, + "num_tokens": 179908219.0, + "step": 375 + }, + { + "epoch": 0.22314540059347182, + "grad_norm": 0.5567691326141357, + "learning_rate": 7.411067193675889e-07, + "loss": 0.8523, + "mean_token_accuracy": 0.74085533618927, + "num_tokens": 180392882.0, + "step": 376 + }, + { + "epoch": 0.22373887240356083, + "grad_norm": 0.5658779740333557, + "learning_rate": 7.430830039525692e-07, + "loss": 0.8656, + "mean_token_accuracy": 0.7377575635910034, + "num_tokens": 180869576.0, + "step": 377 + }, + { + "epoch": 0.22433234421364986, + "grad_norm": 0.5145555138587952, + "learning_rate": 7.450592885375494e-07, + "loss": 0.8537, + "mean_token_accuracy": 0.7392063140869141, + "num_tokens": 181357170.0, + "step": 378 + }, + { + "epoch": 0.22492581602373887, + "grad_norm": 0.5531847476959229, + "learning_rate": 7.470355731225296e-07, + "loss": 0.8512, + "mean_token_accuracy": 0.7415304780006409, + "num_tokens": 181865664.0, + "step": 379 + }, + { + "epoch": 0.22551928783382788, + "grad_norm": 0.5892293453216553, + "learning_rate": 7.490118577075099e-07, + "loss": 0.8573, + "mean_token_accuracy": 0.7376997470855713, + "num_tokens": 182314682.0, + "step": 380 + }, + { + "epoch": 0.22611275964391692, + "grad_norm": 0.5920942425727844, + "learning_rate": 7.509881422924901e-07, + "loss": 0.8649, + "mean_token_accuracy": 0.7370414137840271, + "num_tokens": 182777010.0, + "step": 381 + }, + { + "epoch": 0.22670623145400592, + "grad_norm": 0.537815511226654, + "learning_rate": 7.529644268774703e-07, + "loss": 0.8332, + "mean_token_accuracy": 0.7467966079711914, + "num_tokens": 183302678.0, + 
"step": 382 + }, + { + "epoch": 0.22729970326409496, + "grad_norm": 0.5886869430541992, + "learning_rate": 7.549407114624505e-07, + "loss": 0.8214, + "mean_token_accuracy": 0.7504006624221802, + "num_tokens": 183757739.0, + "step": 383 + }, + { + "epoch": 0.22789317507418397, + "grad_norm": 0.5840797424316406, + "learning_rate": 7.569169960474307e-07, + "loss": 0.8093, + "mean_token_accuracy": 0.7528105974197388, + "num_tokens": 184209585.0, + "step": 384 + }, + { + "epoch": 0.228486646884273, + "grad_norm": 0.5118390917778015, + "learning_rate": 7.58893280632411e-07, + "loss": 0.833, + "mean_token_accuracy": 0.7463947534561157, + "num_tokens": 184747471.0, + "step": 385 + }, + { + "epoch": 0.229080118694362, + "grad_norm": 0.5959621667861938, + "learning_rate": 7.608695652173913e-07, + "loss": 0.8317, + "mean_token_accuracy": 0.7456809282302856, + "num_tokens": 185241060.0, + "step": 386 + }, + { + "epoch": 0.22967359050445105, + "grad_norm": 0.5452444553375244, + "learning_rate": 7.628458498023716e-07, + "loss": 0.7963, + "mean_token_accuracy": 0.7561346888542175, + "num_tokens": 185723800.0, + "step": 387 + }, + { + "epoch": 0.23026706231454006, + "grad_norm": 0.5700525045394897, + "learning_rate": 7.648221343873518e-07, + "loss": 0.8785, + "mean_token_accuracy": 0.7356477975845337, + "num_tokens": 186249004.0, + "step": 388 + }, + { + "epoch": 0.2308605341246291, + "grad_norm": 0.5915219783782959, + "learning_rate": 7.66798418972332e-07, + "loss": 0.8613, + "mean_token_accuracy": 0.7373234033584595, + "num_tokens": 186706570.0, + "step": 389 + }, + { + "epoch": 0.2314540059347181, + "grad_norm": 0.564588725566864, + "learning_rate": 7.687747035573122e-07, + "loss": 0.8803, + "mean_token_accuracy": 0.733102023601532, + "num_tokens": 187179785.0, + "step": 390 + }, + { + "epoch": 0.23204747774480713, + "grad_norm": 0.5354048013687134, + "learning_rate": 7.707509881422924e-07, + "loss": 0.8418, + "mean_token_accuracy": 0.7443498373031616, + "num_tokens": 
187707606.0, + "step": 391 + }, + { + "epoch": 0.23264094955489614, + "grad_norm": 0.5543919205665588, + "learning_rate": 7.727272727272727e-07, + "loss": 0.8592, + "mean_token_accuracy": 0.7391021251678467, + "num_tokens": 188167179.0, + "step": 392 + }, + { + "epoch": 0.23323442136498515, + "grad_norm": 0.5615345239639282, + "learning_rate": 7.747035573122529e-07, + "loss": 0.867, + "mean_token_accuracy": 0.7366525530815125, + "num_tokens": 188647197.0, + "step": 393 + }, + { + "epoch": 0.2338278931750742, + "grad_norm": 0.6101566553115845, + "learning_rate": 7.766798418972331e-07, + "loss": 0.8167, + "mean_token_accuracy": 0.7498373985290527, + "num_tokens": 189086477.0, + "step": 394 + }, + { + "epoch": 0.2344213649851632, + "grad_norm": 0.545558750629425, + "learning_rate": 7.786561264822134e-07, + "loss": 0.7948, + "mean_token_accuracy": 0.7560747861862183, + "num_tokens": 189559923.0, + "step": 395 + }, + { + "epoch": 0.23501483679525223, + "grad_norm": 0.5811753869056702, + "learning_rate": 7.806324110671937e-07, + "loss": 0.8071, + "mean_token_accuracy": 0.7524535655975342, + "num_tokens": 190026347.0, + "step": 396 + }, + { + "epoch": 0.23560830860534124, + "grad_norm": 0.5699445009231567, + "learning_rate": 7.826086956521739e-07, + "loss": 0.7871, + "mean_token_accuracy": 0.7550682425498962, + "num_tokens": 190513146.0, + "step": 397 + }, + { + "epoch": 0.23620178041543027, + "grad_norm": 0.5752913355827332, + "learning_rate": 7.845849802371541e-07, + "loss": 0.7848, + "mean_token_accuracy": 0.7584640979766846, + "num_tokens": 190947977.0, + "step": 398 + }, + { + "epoch": 0.23679525222551928, + "grad_norm": 0.5505977272987366, + "learning_rate": 7.865612648221343e-07, + "loss": 0.7862, + "mean_token_accuracy": 0.7559787034988403, + "num_tokens": 191450734.0, + "step": 399 + }, + { + "epoch": 0.23738872403560832, + "grad_norm": 0.5417904257774353, + "learning_rate": 7.885375494071146e-07, + "loss": 0.7814, + "mean_token_accuracy": 0.7609891891479492, + 
"num_tokens": 191968147.0, + "step": 400 + }, + { + "epoch": 0.23798219584569733, + "grad_norm": 0.5514717698097229, + "learning_rate": 7.905138339920948e-07, + "loss": 0.8438, + "mean_token_accuracy": 0.7421091794967651, + "num_tokens": 192465813.0, + "step": 401 + }, + { + "epoch": 0.23857566765578636, + "grad_norm": 0.6202865242958069, + "learning_rate": 7.92490118577075e-07, + "loss": 0.8578, + "mean_token_accuracy": 0.7391518354415894, + "num_tokens": 192954474.0, + "step": 402 + }, + { + "epoch": 0.23916913946587537, + "grad_norm": 0.5849139094352722, + "learning_rate": 7.944664031620553e-07, + "loss": 0.862, + "mean_token_accuracy": 0.7369478940963745, + "num_tokens": 193403675.0, + "step": 403 + }, + { + "epoch": 0.2397626112759644, + "grad_norm": 0.555288553237915, + "learning_rate": 7.964426877470355e-07, + "loss": 0.7971, + "mean_token_accuracy": 0.7552173137664795, + "num_tokens": 193891579.0, + "step": 404 + }, + { + "epoch": 0.2403560830860534, + "grad_norm": 0.6174401044845581, + "learning_rate": 7.984189723320158e-07, + "loss": 0.8242, + "mean_token_accuracy": 0.7452278137207031, + "num_tokens": 194343958.0, + "step": 405 + }, + { + "epoch": 0.24094955489614242, + "grad_norm": 0.5790275931358337, + "learning_rate": 8.00395256916996e-07, + "loss": 0.8711, + "mean_token_accuracy": 0.7390632629394531, + "num_tokens": 194841318.0, + "step": 406 + }, + { + "epoch": 0.24154302670623146, + "grad_norm": 0.5215025544166565, + "learning_rate": 8.023715415019763e-07, + "loss": 0.8081, + "mean_token_accuracy": 0.7549481987953186, + "num_tokens": 195371762.0, + "step": 407 + }, + { + "epoch": 0.24213649851632046, + "grad_norm": 0.589948296546936, + "learning_rate": 8.043478260869565e-07, + "loss": 0.8486, + "mean_token_accuracy": 0.7410460710525513, + "num_tokens": 195840308.0, + "step": 408 + }, + { + "epoch": 0.2427299703264095, + "grad_norm": 0.584872305393219, + "learning_rate": 8.063241106719367e-07, + "loss": 0.8273, + "mean_token_accuracy": 
0.7491552233695984, + "num_tokens": 196301840.0, + "step": 409 + }, + { + "epoch": 0.2433234421364985, + "grad_norm": 0.5461404919624329, + "learning_rate": 8.08300395256917e-07, + "loss": 0.8565, + "mean_token_accuracy": 0.7412887811660767, + "num_tokens": 196792924.0, + "step": 410 + }, + { + "epoch": 0.24391691394658754, + "grad_norm": 0.5712194442749023, + "learning_rate": 8.102766798418972e-07, + "loss": 0.8168, + "mean_token_accuracy": 0.7516362071037292, + "num_tokens": 197278934.0, + "step": 411 + }, + { + "epoch": 0.24451038575667655, + "grad_norm": 0.6069883108139038, + "learning_rate": 8.122529644268774e-07, + "loss": 0.8467, + "mean_token_accuracy": 0.7409995794296265, + "num_tokens": 197729066.0, + "step": 412 + }, + { + "epoch": 0.2451038575667656, + "grad_norm": 0.5481382608413696, + "learning_rate": 8.142292490118576e-07, + "loss": 0.8759, + "mean_token_accuracy": 0.7379264831542969, + "num_tokens": 198243302.0, + "step": 413 + }, + { + "epoch": 0.2456973293768546, + "grad_norm": 0.584554135799408, + "learning_rate": 8.162055335968378e-07, + "loss": 0.8092, + "mean_token_accuracy": 0.7498738169670105, + "num_tokens": 198724269.0, + "step": 414 + }, + { + "epoch": 0.24629080118694363, + "grad_norm": 0.5454171895980835, + "learning_rate": 8.181818181818182e-07, + "loss": 0.813, + "mean_token_accuracy": 0.7492304444313049, + "num_tokens": 199230740.0, + "step": 415 + }, + { + "epoch": 0.24688427299703264, + "grad_norm": 0.5439335107803345, + "learning_rate": 8.201581027667984e-07, + "loss": 0.7907, + "mean_token_accuracy": 0.7554879188537598, + "num_tokens": 199701345.0, + "step": 416 + }, + { + "epoch": 0.24747774480712167, + "grad_norm": 0.5803811550140381, + "learning_rate": 8.221343873517787e-07, + "loss": 0.8372, + "mean_token_accuracy": 0.7440038919448853, + "num_tokens": 200170063.0, + "step": 417 + }, + { + "epoch": 0.24807121661721068, + "grad_norm": 0.5750409364700317, + "learning_rate": 8.241106719367589e-07, + "loss": 0.8511, + 
"mean_token_accuracy": 0.7417241334915161, + "num_tokens": 200634644.0, + "step": 418 + }, + { + "epoch": 0.2486646884272997, + "grad_norm": 0.5814246535301208, + "learning_rate": 8.260869565217391e-07, + "loss": 0.8561, + "mean_token_accuracy": 0.7397128939628601, + "num_tokens": 201085779.0, + "step": 419 + }, + { + "epoch": 0.24925816023738873, + "grad_norm": 0.5785443782806396, + "learning_rate": 8.280632411067193e-07, + "loss": 0.8674, + "mean_token_accuracy": 0.7360391616821289, + "num_tokens": 201579442.0, + "step": 420 + }, + { + "epoch": 0.24985163204747773, + "grad_norm": 0.5780167579650879, + "learning_rate": 8.300395256916995e-07, + "loss": 0.8329, + "mean_token_accuracy": 0.7457127571105957, + "num_tokens": 202007516.0, + "step": 421 + }, + { + "epoch": 0.25044510385756674, + "grad_norm": 0.538483738899231, + "learning_rate": 8.320158102766798e-07, + "loss": 0.8739, + "mean_token_accuracy": 0.7361444234848022, + "num_tokens": 202491981.0, + "step": 422 + }, + { + "epoch": 0.2510385756676558, + "grad_norm": 0.5869424939155579, + "learning_rate": 8.3399209486166e-07, + "loss": 0.8352, + "mean_token_accuracy": 0.7448616027832031, + "num_tokens": 203000560.0, + "step": 423 + }, + { + "epoch": 0.2516320474777448, + "grad_norm": 0.5946015119552612, + "learning_rate": 8.359683794466402e-07, + "loss": 0.8268, + "mean_token_accuracy": 0.7469689846038818, + "num_tokens": 203431774.0, + "step": 424 + }, + { + "epoch": 0.2522255192878338, + "grad_norm": 0.5669561624526978, + "learning_rate": 8.379446640316206e-07, + "loss": 0.8518, + "mean_token_accuracy": 0.7388594150543213, + "num_tokens": 203906875.0, + "step": 425 + }, + { + "epoch": 0.25281899109792283, + "grad_norm": 0.5556800961494446, + "learning_rate": 8.399209486166008e-07, + "loss": 0.8305, + "mean_token_accuracy": 0.746029257774353, + "num_tokens": 204386920.0, + "step": 426 + }, + { + "epoch": 0.2534124629080119, + "grad_norm": 0.5697214007377625, + "learning_rate": 8.41897233201581e-07, + "loss": 
0.8001, + "mean_token_accuracy": 0.7530815601348877, + "num_tokens": 204864439.0, + "step": 427 + }, + { + "epoch": 0.2540059347181009, + "grad_norm": 0.5256825089454651, + "learning_rate": 8.438735177865612e-07, + "loss": 0.8294, + "mean_token_accuracy": 0.7466500997543335, + "num_tokens": 205375222.0, + "step": 428 + }, + { + "epoch": 0.2545994065281899, + "grad_norm": 0.5571102499961853, + "learning_rate": 8.458498023715415e-07, + "loss": 0.8614, + "mean_token_accuracy": 0.7374507188796997, + "num_tokens": 205884707.0, + "step": 429 + }, + { + "epoch": 0.2551928783382789, + "grad_norm": 0.6016830801963806, + "learning_rate": 8.478260869565217e-07, + "loss": 0.827, + "mean_token_accuracy": 0.7480778694152832, + "num_tokens": 206364428.0, + "step": 430 + }, + { + "epoch": 0.255786350148368, + "grad_norm": 0.531629204750061, + "learning_rate": 8.498023715415019e-07, + "loss": 0.8081, + "mean_token_accuracy": 0.7519898414611816, + "num_tokens": 206877911.0, + "step": 431 + }, + { + "epoch": 0.256379821958457, + "grad_norm": 0.5726255774497986, + "learning_rate": 8.517786561264822e-07, + "loss": 0.7945, + "mean_token_accuracy": 0.7584315538406372, + "num_tokens": 207372706.0, + "step": 432 + }, + { + "epoch": 0.256973293768546, + "grad_norm": 0.5476090312004089, + "learning_rate": 8.537549407114624e-07, + "loss": 0.8092, + "mean_token_accuracy": 0.7525683045387268, + "num_tokens": 207848253.0, + "step": 433 + }, + { + "epoch": 0.257566765578635, + "grad_norm": 0.5843583345413208, + "learning_rate": 8.557312252964426e-07, + "loss": 0.8711, + "mean_token_accuracy": 0.7335138320922852, + "num_tokens": 208289852.0, + "step": 434 + }, + { + "epoch": 0.258160237388724, + "grad_norm": 0.5407089591026306, + "learning_rate": 8.577075098814229e-07, + "loss": 0.7894, + "mean_token_accuracy": 0.758036732673645, + "num_tokens": 208764080.0, + "step": 435 + }, + { + "epoch": 0.2587537091988131, + "grad_norm": 0.5688757300376892, + "learning_rate": 8.596837944664031e-07, + "loss": 
0.8492, + "mean_token_accuracy": 0.741197943687439, + "num_tokens": 209284204.0, + "step": 436 + }, + { + "epoch": 0.2593471810089021, + "grad_norm": 0.5285140872001648, + "learning_rate": 8.616600790513834e-07, + "loss": 0.8722, + "mean_token_accuracy": 0.7377383708953857, + "num_tokens": 209788910.0, + "step": 437 + }, + { + "epoch": 0.2599406528189911, + "grad_norm": 0.5688947439193726, + "learning_rate": 8.636363636363636e-07, + "loss": 0.8628, + "mean_token_accuracy": 0.7381122708320618, + "num_tokens": 210265775.0, + "step": 438 + }, + { + "epoch": 0.2605341246290801, + "grad_norm": 0.5742479562759399, + "learning_rate": 8.656126482213438e-07, + "loss": 0.8038, + "mean_token_accuracy": 0.7504374980926514, + "num_tokens": 210723975.0, + "step": 439 + }, + { + "epoch": 0.26112759643916916, + "grad_norm": 0.5627344250679016, + "learning_rate": 8.675889328063241e-07, + "loss": 0.8666, + "mean_token_accuracy": 0.7353873252868652, + "num_tokens": 211198119.0, + "step": 440 + }, + { + "epoch": 0.26172106824925817, + "grad_norm": 0.5264517068862915, + "learning_rate": 8.695652173913043e-07, + "loss": 0.8084, + "mean_token_accuracy": 0.7511667013168335, + "num_tokens": 211687889.0, + "step": 441 + }, + { + "epoch": 0.2623145400593472, + "grad_norm": 0.5388584136962891, + "learning_rate": 8.715415019762845e-07, + "loss": 0.8221, + "mean_token_accuracy": 0.7498415112495422, + "num_tokens": 212187778.0, + "step": 442 + }, + { + "epoch": 0.2629080118694362, + "grad_norm": 0.5463522672653198, + "learning_rate": 8.735177865612647e-07, + "loss": 0.8224, + "mean_token_accuracy": 0.7460856437683105, + "num_tokens": 212687607.0, + "step": 443 + }, + { + "epoch": 0.26350148367952525, + "grad_norm": 0.5483478307723999, + "learning_rate": 8.754940711462451e-07, + "loss": 0.846, + "mean_token_accuracy": 0.7404451370239258, + "num_tokens": 213159238.0, + "step": 444 + }, + { + "epoch": 0.26409495548961426, + "grad_norm": 0.5564917325973511, + "learning_rate": 8.774703557312253e-07, 
+ "loss": 0.8188, + "mean_token_accuracy": 0.7520537376403809, + "num_tokens": 213602612.0, + "step": 445 + }, + { + "epoch": 0.26468842729970327, + "grad_norm": 0.5886843204498291, + "learning_rate": 8.794466403162055e-07, + "loss": 0.8281, + "mean_token_accuracy": 0.7453309297561646, + "num_tokens": 214038972.0, + "step": 446 + }, + { + "epoch": 0.2652818991097923, + "grad_norm": 0.5816295742988586, + "learning_rate": 8.814229249011858e-07, + "loss": 0.8348, + "mean_token_accuracy": 0.7482882738113403, + "num_tokens": 214511110.0, + "step": 447 + }, + { + "epoch": 0.2658753709198813, + "grad_norm": 0.5597276091575623, + "learning_rate": 8.83399209486166e-07, + "loss": 0.8085, + "mean_token_accuracy": 0.7507785558700562, + "num_tokens": 214977278.0, + "step": 448 + }, + { + "epoch": 0.26646884272997035, + "grad_norm": 0.5775185823440552, + "learning_rate": 8.853754940711462e-07, + "loss": 0.8333, + "mean_token_accuracy": 0.7465658187866211, + "num_tokens": 215449236.0, + "step": 449 + }, + { + "epoch": 0.26706231454005935, + "grad_norm": 0.5695748925209045, + "learning_rate": 8.873517786561264e-07, + "loss": 0.836, + "mean_token_accuracy": 0.746168851852417, + "num_tokens": 215934404.0, + "step": 450 + }, + { + "epoch": 0.26765578635014836, + "grad_norm": 0.5544785857200623, + "learning_rate": 8.893280632411066e-07, + "loss": 0.8289, + "mean_token_accuracy": 0.7464814186096191, + "num_tokens": 216400218.0, + "step": 451 + }, + { + "epoch": 0.26824925816023737, + "grad_norm": 0.5527375340461731, + "learning_rate": 8.913043478260869e-07, + "loss": 0.7941, + "mean_token_accuracy": 0.7529213428497314, + "num_tokens": 216888559.0, + "step": 452 + }, + { + "epoch": 0.26884272997032643, + "grad_norm": 0.540268063545227, + "learning_rate": 8.932806324110671e-07, + "loss": 0.8274, + "mean_token_accuracy": 0.7471795082092285, + "num_tokens": 217434680.0, + "step": 453 + }, + { + "epoch": 0.26943620178041544, + "grad_norm": 0.5395182967185974, + "learning_rate": 
8.952569169960475e-07, + "loss": 0.8285, + "mean_token_accuracy": 0.7454962730407715, + "num_tokens": 217963918.0, + "step": 454 + }, + { + "epoch": 0.27002967359050445, + "grad_norm": 0.5588404536247253, + "learning_rate": 8.972332015810277e-07, + "loss": 0.8554, + "mean_token_accuracy": 0.7392786145210266, + "num_tokens": 218512537.0, + "step": 455 + }, + { + "epoch": 0.27062314540059346, + "grad_norm": 0.5678071975708008, + "learning_rate": 8.992094861660079e-07, + "loss": 0.8023, + "mean_token_accuracy": 0.7519606351852417, + "num_tokens": 218987867.0, + "step": 456 + }, + { + "epoch": 0.2712166172106825, + "grad_norm": 0.6112318634986877, + "learning_rate": 9.011857707509881e-07, + "loss": 0.8281, + "mean_token_accuracy": 0.7467029690742493, + "num_tokens": 219447770.0, + "step": 457 + }, + { + "epoch": 0.27181008902077153, + "grad_norm": 0.5851930975914001, + "learning_rate": 9.031620553359683e-07, + "loss": 0.831, + "mean_token_accuracy": 0.7472394108772278, + "num_tokens": 219948690.0, + "step": 458 + }, + { + "epoch": 0.27240356083086054, + "grad_norm": 0.5416722893714905, + "learning_rate": 9.051383399209486e-07, + "loss": 0.8557, + "mean_token_accuracy": 0.7403098344802856, + "num_tokens": 220438530.0, + "step": 459 + }, + { + "epoch": 0.27299703264094954, + "grad_norm": 0.5722333192825317, + "learning_rate": 9.071146245059288e-07, + "loss": 0.8503, + "mean_token_accuracy": 0.7436543107032776, + "num_tokens": 220937753.0, + "step": 460 + }, + { + "epoch": 0.27359050445103855, + "grad_norm": 0.5569276809692383, + "learning_rate": 9.09090909090909e-07, + "loss": 0.7997, + "mean_token_accuracy": 0.7536998987197876, + "num_tokens": 221455725.0, + "step": 461 + }, + { + "epoch": 0.2741839762611276, + "grad_norm": 0.5667770504951477, + "learning_rate": 9.110671936758893e-07, + "loss": 0.7946, + "mean_token_accuracy": 0.7551641464233398, + "num_tokens": 221914625.0, + "step": 462 + }, + { + "epoch": 0.2747774480712166, + "grad_norm": 0.5660532116889954, + 
"learning_rate": 9.130434782608695e-07, + "loss": 0.8615, + "mean_token_accuracy": 0.7397037744522095, + "num_tokens": 222422600.0, + "step": 463 + }, + { + "epoch": 0.27537091988130563, + "grad_norm": 0.5751998424530029, + "learning_rate": 9.150197628458498e-07, + "loss": 0.8351, + "mean_token_accuracy": 0.7427976131439209, + "num_tokens": 222935852.0, + "step": 464 + }, + { + "epoch": 0.27596439169139464, + "grad_norm": 0.532718300819397, + "learning_rate": 9.1699604743083e-07, + "loss": 0.8313, + "mean_token_accuracy": 0.7469978928565979, + "num_tokens": 223426170.0, + "step": 465 + }, + { + "epoch": 0.2765578635014837, + "grad_norm": 0.5836936235427856, + "learning_rate": 9.189723320158103e-07, + "loss": 0.7816, + "mean_token_accuracy": 0.7599872350692749, + "num_tokens": 223925354.0, + "step": 466 + }, + { + "epoch": 0.2771513353115727, + "grad_norm": 0.5895096659660339, + "learning_rate": 9.209486166007905e-07, + "loss": 0.8192, + "mean_token_accuracy": 0.749151349067688, + "num_tokens": 224395092.0, + "step": 467 + }, + { + "epoch": 0.2777448071216617, + "grad_norm": 0.5938785076141357, + "learning_rate": 9.229249011857707e-07, + "loss": 0.8128, + "mean_token_accuracy": 0.750018298625946, + "num_tokens": 224852048.0, + "step": 468 + }, + { + "epoch": 0.2783382789317507, + "grad_norm": 0.613647997379303, + "learning_rate": 9.24901185770751e-07, + "loss": 0.8105, + "mean_token_accuracy": 0.7508031725883484, + "num_tokens": 225296563.0, + "step": 469 + }, + { + "epoch": 0.2789317507418398, + "grad_norm": 0.6147931814193726, + "learning_rate": 9.268774703557312e-07, + "loss": 0.8486, + "mean_token_accuracy": 0.737163782119751, + "num_tokens": 225715337.0, + "step": 470 + }, + { + "epoch": 0.2795252225519288, + "grad_norm": 0.5400534272193909, + "learning_rate": 9.288537549407114e-07, + "loss": 0.7865, + "mean_token_accuracy": 0.7577413320541382, + "num_tokens": 226229617.0, + "step": 471 + }, + { + "epoch": 0.2801186943620178, + "grad_norm": 0.6265953779220581, 
+ "learning_rate": 9.308300395256916e-07, + "loss": 0.8462, + "mean_token_accuracy": 0.7414346933364868, + "num_tokens": 226677952.0, + "step": 472 + }, + { + "epoch": 0.2807121661721068, + "grad_norm": 0.5731704831123352, + "learning_rate": 9.328063241106719e-07, + "loss": 0.7997, + "mean_token_accuracy": 0.7550766468048096, + "num_tokens": 227154210.0, + "step": 473 + }, + { + "epoch": 0.2813056379821958, + "grad_norm": 0.6032576560974121, + "learning_rate": 9.347826086956522e-07, + "loss": 0.836, + "mean_token_accuracy": 0.7433600425720215, + "num_tokens": 227641623.0, + "step": 474 + }, + { + "epoch": 0.2818991097922849, + "grad_norm": 0.5951387286186218, + "learning_rate": 9.367588932806324e-07, + "loss": 0.803, + "mean_token_accuracy": 0.7543395757675171, + "num_tokens": 228101188.0, + "step": 475 + }, + { + "epoch": 0.2824925816023739, + "grad_norm": 0.5698041319847107, + "learning_rate": 9.387351778656126e-07, + "loss": 0.8667, + "mean_token_accuracy": 0.7369846105575562, + "num_tokens": 228569891.0, + "step": 476 + }, + { + "epoch": 0.2830860534124629, + "grad_norm": 0.5400965213775635, + "learning_rate": 9.407114624505929e-07, + "loss": 0.7874, + "mean_token_accuracy": 0.7568495273590088, + "num_tokens": 229068356.0, + "step": 477 + }, + { + "epoch": 0.2836795252225519, + "grad_norm": 0.5471845269203186, + "learning_rate": 9.426877470355731e-07, + "loss": 0.8299, + "mean_token_accuracy": 0.7459166049957275, + "num_tokens": 229576271.0, + "step": 478 + }, + { + "epoch": 0.284272997032641, + "grad_norm": 0.5634534358978271, + "learning_rate": 9.446640316205533e-07, + "loss": 0.8597, + "mean_token_accuracy": 0.7392269372940063, + "num_tokens": 230052614.0, + "step": 479 + }, + { + "epoch": 0.28486646884273, + "grad_norm": 0.5530499219894409, + "learning_rate": 9.466403162055335e-07, + "loss": 0.8355, + "mean_token_accuracy": 0.7448794841766357, + "num_tokens": 230521966.0, + "step": 480 + }, + { + "epoch": 0.285459940652819, + "grad_norm": 
0.5901499390602112, + "learning_rate": 9.486166007905137e-07, + "loss": 0.8678, + "mean_token_accuracy": 0.7363789081573486, + "num_tokens": 230992588.0, + "step": 481 + }, + { + "epoch": 0.286053412462908, + "grad_norm": 0.5539312958717346, + "learning_rate": 9.50592885375494e-07, + "loss": 0.7964, + "mean_token_accuracy": 0.7532342672348022, + "num_tokens": 231436683.0, + "step": 482 + }, + { + "epoch": 0.28664688427299706, + "grad_norm": 0.5502335429191589, + "learning_rate": 9.525691699604743e-07, + "loss": 0.8728, + "mean_token_accuracy": 0.7358449697494507, + "num_tokens": 231918355.0, + "step": 483 + }, + { + "epoch": 0.28724035608308607, + "grad_norm": 0.5855311155319214, + "learning_rate": 9.545454545454546e-07, + "loss": 0.8169, + "mean_token_accuracy": 0.748111367225647, + "num_tokens": 232385157.0, + "step": 484 + }, + { + "epoch": 0.2878338278931751, + "grad_norm": 0.5835723280906677, + "learning_rate": 9.565217391304349e-07, + "loss": 0.84, + "mean_token_accuracy": 0.7433302402496338, + "num_tokens": 232825068.0, + "step": 485 + }, + { + "epoch": 0.2884272997032641, + "grad_norm": 0.5629599094390869, + "learning_rate": 9.58498023715415e-07, + "loss": 0.799, + "mean_token_accuracy": 0.7521035671234131, + "num_tokens": 233285875.0, + "step": 486 + }, + { + "epoch": 0.2890207715133531, + "grad_norm": 0.5306742787361145, + "learning_rate": 9.604743083003953e-07, + "loss": 0.8364, + "mean_token_accuracy": 0.7459594011306763, + "num_tokens": 233800874.0, + "step": 487 + }, + { + "epoch": 0.28961424332344216, + "grad_norm": 0.5675927996635437, + "learning_rate": 9.624505928853754e-07, + "loss": 0.7998, + "mean_token_accuracy": 0.7538193464279175, + "num_tokens": 234290076.0, + "step": 488 + }, + { + "epoch": 0.29020771513353116, + "grad_norm": 0.5592665672302246, + "learning_rate": 9.644268774703557e-07, + "loss": 0.8723, + "mean_token_accuracy": 0.735554039478302, + "num_tokens": 234792469.0, + "step": 489 + }, + { + "epoch": 0.29080118694362017, + 
"grad_norm": 0.5579455494880676, + "learning_rate": 9.664031620553358e-07, + "loss": 0.7865, + "mean_token_accuracy": 0.7590001225471497, + "num_tokens": 235253866.0, + "step": 490 + }, + { + "epoch": 0.2913946587537092, + "grad_norm": 0.5726102590560913, + "learning_rate": 9.683794466403161e-07, + "loss": 0.8305, + "mean_token_accuracy": 0.7448549866676331, + "num_tokens": 235718767.0, + "step": 491 + }, + { + "epoch": 0.29198813056379824, + "grad_norm": 0.5583921670913696, + "learning_rate": 9.703557312252962e-07, + "loss": 0.7622, + "mean_token_accuracy": 0.762686014175415, + "num_tokens": 236198722.0, + "step": 492 + }, + { + "epoch": 0.29258160237388725, + "grad_norm": 0.5501416325569153, + "learning_rate": 9.723320158102768e-07, + "loss": 0.7748, + "mean_token_accuracy": 0.7597334384918213, + "num_tokens": 236718768.0, + "step": 493 + }, + { + "epoch": 0.29317507418397626, + "grad_norm": 0.5898497104644775, + "learning_rate": 9.743083003952569e-07, + "loss": 0.8396, + "mean_token_accuracy": 0.7422404289245605, + "num_tokens": 237168899.0, + "step": 494 + }, + { + "epoch": 0.29376854599406527, + "grad_norm": 0.5905404686927795, + "learning_rate": 9.762845849802372e-07, + "loss": 0.7983, + "mean_token_accuracy": 0.7573241591453552, + "num_tokens": 237638530.0, + "step": 495 + }, + { + "epoch": 0.29436201780415433, + "grad_norm": 0.5515053272247314, + "learning_rate": 9.782608695652173e-07, + "loss": 0.8109, + "mean_token_accuracy": 0.7515136003494263, + "num_tokens": 238119781.0, + "step": 496 + }, + { + "epoch": 0.29495548961424334, + "grad_norm": 0.5783311128616333, + "learning_rate": 9.802371541501976e-07, + "loss": 0.822, + "mean_token_accuracy": 0.7495622634887695, + "num_tokens": 238640502.0, + "step": 497 + }, + { + "epoch": 0.29554896142433235, + "grad_norm": 0.5716015696525574, + "learning_rate": 9.822134387351777e-07, + "loss": 0.8999, + "mean_token_accuracy": 0.7267526984214783, + "num_tokens": 239108097.0, + "step": 498 + }, + { + "epoch": 
0.29614243323442135, + "grad_norm": 0.5571446418762207, + "learning_rate": 9.84189723320158e-07, + "loss": 0.8777, + "mean_token_accuracy": 0.7333240509033203, + "num_tokens": 239585920.0, + "step": 499 + }, + { + "epoch": 0.29673590504451036, + "grad_norm": 0.5982879400253296, + "learning_rate": 9.861660079051384e-07, + "loss": 0.7839, + "mean_token_accuracy": 0.7555792927742004, + "num_tokens": 240042193.0, + "step": 500 + }, + { + "epoch": 0.2973293768545994, + "grad_norm": 0.575635552406311, + "learning_rate": 9.881422924901185e-07, + "loss": 0.8388, + "mean_token_accuracy": 0.7422996759414673, + "num_tokens": 240510191.0, + "step": 501 + }, + { + "epoch": 0.29792284866468843, + "grad_norm": 0.5727033019065857, + "learning_rate": 9.901185770750988e-07, + "loss": 0.8578, + "mean_token_accuracy": 0.7378767728805542, + "num_tokens": 240993195.0, + "step": 502 + }, + { + "epoch": 0.29851632047477744, + "grad_norm": 0.5541064143180847, + "learning_rate": 9.920948616600791e-07, + "loss": 0.791, + "mean_token_accuracy": 0.7556321024894714, + "num_tokens": 241443725.0, + "step": 503 + }, + { + "epoch": 0.29910979228486645, + "grad_norm": 0.5725215673446655, + "learning_rate": 9.940711462450592e-07, + "loss": 0.8199, + "mean_token_accuracy": 0.7480305433273315, + "num_tokens": 241910832.0, + "step": 504 + }, + { + "epoch": 0.2997032640949555, + "grad_norm": 0.5382090210914612, + "learning_rate": 9.960474308300395e-07, + "loss": 0.8387, + "mean_token_accuracy": 0.7432488799095154, + "num_tokens": 242440641.0, + "step": 505 + }, + { + "epoch": 0.3002967359050445, + "grad_norm": 1.1616075038909912, + "learning_rate": 9.980237154150196e-07, + "loss": 0.8306, + "mean_token_accuracy": 0.7467166781425476, + "num_tokens": 242955712.0, + "step": 506 + }, + { + "epoch": 0.30089020771513353, + "grad_norm": 0.5430070757865906, + "learning_rate": 1e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7561200857162476, + "num_tokens": 243442096.0, + "step": 507 + }, + { + "epoch": 
0.30148367952522254, + "grad_norm": 0.5607202649116516, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7489442825317383, + "num_tokens": 243908877.0, + "step": 508 + }, + { + "epoch": 0.3020771513353116, + "grad_norm": 0.5843808054924011, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.744725227355957, + "num_tokens": 244371009.0, + "step": 509 + }, + { + "epoch": 0.3026706231454006, + "grad_norm": 0.5298656225204468, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7601484060287476, + "num_tokens": 244905721.0, + "step": 510 + }, + { + "epoch": 0.3032640949554896, + "grad_norm": 0.5650370121002197, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7469621896743774, + "num_tokens": 245387157.0, + "step": 511 + }, + { + "epoch": 0.3038575667655786, + "grad_norm": 0.5610879063606262, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7434603571891785, + "num_tokens": 245859563.0, + "step": 512 + }, + { + "epoch": 0.30445103857566763, + "grad_norm": 0.5230905413627625, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.752048909664154, + "num_tokens": 246388321.0, + "step": 513 + }, + { + "epoch": 0.3050445103857567, + "grad_norm": 0.5666635632514954, + "learning_rate": 1e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7532026767730713, + "num_tokens": 246851485.0, + "step": 514 + }, + { + "epoch": 0.3056379821958457, + "grad_norm": 0.5575776100158691, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7387830018997192, + "num_tokens": 247312695.0, + "step": 515 + }, + { + "epoch": 0.3062314540059347, + "grad_norm": 0.5411115884780884, + "learning_rate": 1e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7521548271179199, + "num_tokens": 247812609.0, + "step": 516 + }, + { + "epoch": 0.3068249258160237, + "grad_norm": 0.5780379772186279, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 
0.7540773153305054, + "num_tokens": 248270310.0, + "step": 517 + }, + { + "epoch": 0.3074183976261128, + "grad_norm": 0.5176886320114136, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7623218297958374, + "num_tokens": 248780436.0, + "step": 518 + }, + { + "epoch": 0.3080118694362018, + "grad_norm": 0.5998214483261108, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7455215454101562, + "num_tokens": 249221168.0, + "step": 519 + }, + { + "epoch": 0.3086053412462908, + "grad_norm": 0.5532945990562439, + "learning_rate": 1e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7582194805145264, + "num_tokens": 249713276.0, + "step": 520 + }, + { + "epoch": 0.3091988130563798, + "grad_norm": 0.5532928109169006, + "learning_rate": 1e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.755986213684082, + "num_tokens": 250187836.0, + "step": 521 + }, + { + "epoch": 0.30979228486646887, + "grad_norm": 0.5560587644577026, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7564797401428223, + "num_tokens": 250672394.0, + "step": 522 + }, + { + "epoch": 0.3103857566765579, + "grad_norm": 0.554141104221344, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7362441420555115, + "num_tokens": 251169081.0, + "step": 523 + }, + { + "epoch": 0.3109792284866469, + "grad_norm": 0.5309992432594299, + "learning_rate": 1e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7532570958137512, + "num_tokens": 251664323.0, + "step": 524 + }, + { + "epoch": 0.3115727002967359, + "grad_norm": 0.6159276962280273, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7362700700759888, + "num_tokens": 252148735.0, + "step": 525 + }, + { + "epoch": 0.3121661721068249, + "grad_norm": 0.543459951877594, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7519946694374084, + "num_tokens": 252683501.0, + "step": 526 + }, + { + "epoch": 0.31275964391691397, + "grad_norm": 0.5526793599128723, 
+ "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.74806809425354, + "num_tokens": 253151770.0, + "step": 527 + }, + { + "epoch": 0.313353115727003, + "grad_norm": 0.6390369534492493, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7421022653579712, + "num_tokens": 253634426.0, + "step": 528 + }, + { + "epoch": 0.313946587537092, + "grad_norm": 0.5919458270072937, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7448492050170898, + "num_tokens": 254074071.0, + "step": 529 + }, + { + "epoch": 0.314540059347181, + "grad_norm": 0.529263436794281, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7513593435287476, + "num_tokens": 254551365.0, + "step": 530 + }, + { + "epoch": 0.31513353115727005, + "grad_norm": 0.5262753367424011, + "learning_rate": 1e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7513121366500854, + "num_tokens": 255095150.0, + "step": 531 + }, + { + "epoch": 0.31572700296735906, + "grad_norm": 0.5911763906478882, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7471746802330017, + "num_tokens": 255557470.0, + "step": 532 + }, + { + "epoch": 0.31632047477744807, + "grad_norm": 0.5653179883956909, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7514071464538574, + "num_tokens": 256017094.0, + "step": 533 + }, + { + "epoch": 0.3169139465875371, + "grad_norm": 0.5822415351867676, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7541580200195312, + "num_tokens": 256480754.0, + "step": 534 + }, + { + "epoch": 0.31750741839762614, + "grad_norm": 0.5648320913314819, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7489820718765259, + "num_tokens": 256963085.0, + "step": 535 + }, + { + "epoch": 0.31810089020771515, + "grad_norm": 0.5581892728805542, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7521771192550659, + "num_tokens": 257456843.0, + "step": 536 + }, + 
{ + "epoch": 0.31869436201780416, + "grad_norm": 0.5829698443412781, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7466498613357544, + "num_tokens": 257952000.0, + "step": 537 + }, + { + "epoch": 0.31928783382789316, + "grad_norm": 0.5899885892868042, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7422218322753906, + "num_tokens": 258394630.0, + "step": 538 + }, + { + "epoch": 0.31988130563798217, + "grad_norm": 0.5753250122070312, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7440284490585327, + "num_tokens": 258862503.0, + "step": 539 + }, + { + "epoch": 0.32047477744807124, + "grad_norm": 0.5555858612060547, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.759975790977478, + "num_tokens": 259370862.0, + "step": 540 + }, + { + "epoch": 0.32106824925816024, + "grad_norm": 0.5473886132240295, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7582173943519592, + "num_tokens": 259835987.0, + "step": 541 + }, + { + "epoch": 0.32166172106824925, + "grad_norm": 0.5619679689407349, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7467259764671326, + "num_tokens": 260326723.0, + "step": 542 + }, + { + "epoch": 0.32225519287833826, + "grad_norm": 0.5793821811676025, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7526777982711792, + "num_tokens": 260795313.0, + "step": 543 + }, + { + "epoch": 0.3228486646884273, + "grad_norm": 0.551350474357605, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7528319954872131, + "num_tokens": 261316830.0, + "step": 544 + }, + { + "epoch": 0.32344213649851633, + "grad_norm": 0.5634416341781616, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7556350827217102, + "num_tokens": 261804970.0, + "step": 545 + }, + { + "epoch": 0.32403560830860534, + "grad_norm": 0.549149215221405, + "learning_rate": 1e-06, + "loss": 0.8389, + 
"mean_token_accuracy": 0.7413468360900879, + "num_tokens": 262337260.0, + "step": 546 + }, + { + "epoch": 0.32462908011869435, + "grad_norm": 0.5660499334335327, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7538917660713196, + "num_tokens": 262798830.0, + "step": 547 + }, + { + "epoch": 0.3252225519287834, + "grad_norm": 0.5454797744750977, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7612217664718628, + "num_tokens": 263294102.0, + "step": 548 + }, + { + "epoch": 0.3258160237388724, + "grad_norm": 0.5469000935554504, + "learning_rate": 1e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7569522857666016, + "num_tokens": 263772896.0, + "step": 549 + }, + { + "epoch": 0.3264094955489614, + "grad_norm": 0.5953837633132935, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7496882677078247, + "num_tokens": 264217558.0, + "step": 550 + }, + { + "epoch": 0.32700296735905043, + "grad_norm": 0.559464693069458, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7493985891342163, + "num_tokens": 264722133.0, + "step": 551 + }, + { + "epoch": 0.32759643916913944, + "grad_norm": 0.5333753228187561, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7414050698280334, + "num_tokens": 265212368.0, + "step": 552 + }, + { + "epoch": 0.3281899109792285, + "grad_norm": 0.5708307027816772, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7427030801773071, + "num_tokens": 265714207.0, + "step": 553 + }, + { + "epoch": 0.3287833827893175, + "grad_norm": 0.5947666764259338, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7592661380767822, + "num_tokens": 266196163.0, + "step": 554 + }, + { + "epoch": 0.3293768545994065, + "grad_norm": 0.5479994416236877, + "learning_rate": 1e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7680354118347168, + "num_tokens": 266707724.0, + "step": 555 + }, + { + "epoch": 0.32997032640949553, + 
"grad_norm": 0.5648009777069092, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7485061883926392, + "num_tokens": 267188716.0, + "step": 556 + }, + { + "epoch": 0.3305637982195846, + "grad_norm": 0.5802651047706604, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7564350962638855, + "num_tokens": 267650014.0, + "step": 557 + }, + { + "epoch": 0.3311572700296736, + "grad_norm": 0.5678566098213196, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7660999298095703, + "num_tokens": 268140074.0, + "step": 558 + }, + { + "epoch": 0.3317507418397626, + "grad_norm": 0.5763184428215027, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7662832736968994, + "num_tokens": 268614303.0, + "step": 559 + }, + { + "epoch": 0.3323442136498516, + "grad_norm": 0.550287127494812, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7541336417198181, + "num_tokens": 269125069.0, + "step": 560 + }, + { + "epoch": 0.3329376854599407, + "grad_norm": 0.559655487537384, + "learning_rate": 1e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7600067853927612, + "num_tokens": 269614785.0, + "step": 561 + }, + { + "epoch": 0.3335311572700297, + "grad_norm": 0.576621949672699, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7311983108520508, + "num_tokens": 270121881.0, + "step": 562 + }, + { + "epoch": 0.3341246290801187, + "grad_norm": 0.5433235168457031, + "learning_rate": 1e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7556544542312622, + "num_tokens": 270601478.0, + "step": 563 + }, + { + "epoch": 0.3347181008902077, + "grad_norm": 0.5555203557014465, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7475922703742981, + "num_tokens": 271064955.0, + "step": 564 + }, + { + "epoch": 0.3353115727002967, + "grad_norm": 0.5958123803138733, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7602030038833618, + "num_tokens": 
271526918.0, + "step": 565 + }, + { + "epoch": 0.3359050445103858, + "grad_norm": 0.6042894124984741, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7444372773170471, + "num_tokens": 271993232.0, + "step": 566 + }, + { + "epoch": 0.3364985163204748, + "grad_norm": 0.5609709024429321, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7715657949447632, + "num_tokens": 272443616.0, + "step": 567 + }, + { + "epoch": 0.3370919881305638, + "grad_norm": 0.5613376498222351, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7602491974830627, + "num_tokens": 272901026.0, + "step": 568 + }, + { + "epoch": 0.3376854599406528, + "grad_norm": 0.6105250716209412, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7484251260757446, + "num_tokens": 273359566.0, + "step": 569 + }, + { + "epoch": 0.33827893175074186, + "grad_norm": 0.5981996059417725, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7577224969863892, + "num_tokens": 273827982.0, + "step": 570 + }, + { + "epoch": 0.33887240356083087, + "grad_norm": 0.5505020022392273, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7520736455917358, + "num_tokens": 274306091.0, + "step": 571 + }, + { + "epoch": 0.3394658753709199, + "grad_norm": 0.5525766015052795, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7475444078445435, + "num_tokens": 274789916.0, + "step": 572 + }, + { + "epoch": 0.3400593471810089, + "grad_norm": 0.5683265328407288, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7365883588790894, + "num_tokens": 275281796.0, + "step": 573 + }, + { + "epoch": 0.34065281899109795, + "grad_norm": 0.5623670816421509, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7472875118255615, + "num_tokens": 275729525.0, + "step": 574 + }, + { + "epoch": 0.34124629080118696, + "grad_norm": 0.5337696075439453, + "learning_rate": 1e-06, + 
"loss": 0.7479, + "mean_token_accuracy": 0.7686553597450256, + "num_tokens": 276212736.0, + "step": 575 + }, + { + "epoch": 0.34183976261127597, + "grad_norm": 0.5469380021095276, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.735283374786377, + "num_tokens": 276706491.0, + "step": 576 + }, + { + "epoch": 0.342433234421365, + "grad_norm": 0.5308003425598145, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.753704845905304, + "num_tokens": 277191194.0, + "step": 577 + }, + { + "epoch": 0.343026706231454, + "grad_norm": 0.5558704733848572, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7455588579177856, + "num_tokens": 277716275.0, + "step": 578 + }, + { + "epoch": 0.34362017804154305, + "grad_norm": 0.5987065434455872, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7315919399261475, + "num_tokens": 278174572.0, + "step": 579 + }, + { + "epoch": 0.34421364985163205, + "grad_norm": 0.5286816954612732, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.748740017414093, + "num_tokens": 278684362.0, + "step": 580 + }, + { + "epoch": 0.34480712166172106, + "grad_norm": 0.5136172771453857, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7443352937698364, + "num_tokens": 279202812.0, + "step": 581 + }, + { + "epoch": 0.34540059347181007, + "grad_norm": 0.5910993814468384, + "learning_rate": 1e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7503370046615601, + "num_tokens": 279677592.0, + "step": 582 + }, + { + "epoch": 0.34599406528189913, + "grad_norm": 0.5451798439025879, + "learning_rate": 1e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7563173770904541, + "num_tokens": 280161784.0, + "step": 583 + }, + { + "epoch": 0.34658753709198814, + "grad_norm": 0.575888454914093, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7627806663513184, + "num_tokens": 280653927.0, + "step": 584 + }, + { + "epoch": 
0.34718100890207715, + "grad_norm": 0.5784773826599121, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7460836172103882, + "num_tokens": 281131757.0, + "step": 585 + }, + { + "epoch": 0.34777448071216616, + "grad_norm": 0.5469129085540771, + "learning_rate": 1e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7511472702026367, + "num_tokens": 281596066.0, + "step": 586 + }, + { + "epoch": 0.3483679525222552, + "grad_norm": 0.5255166888237, + "learning_rate": 1e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7462837100028992, + "num_tokens": 282098678.0, + "step": 587 + }, + { + "epoch": 0.3489614243323442, + "grad_norm": 0.5240246057510376, + "learning_rate": 1e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7674696445465088, + "num_tokens": 282600963.0, + "step": 588 + }, + { + "epoch": 0.34955489614243324, + "grad_norm": 0.5807877779006958, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7482874393463135, + "num_tokens": 283044056.0, + "step": 589 + }, + { + "epoch": 0.35014836795252224, + "grad_norm": 0.5716282725334167, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7631107568740845, + "num_tokens": 283533094.0, + "step": 590 + }, + { + "epoch": 0.35074183976261125, + "grad_norm": 0.555046021938324, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7575491666793823, + "num_tokens": 284021832.0, + "step": 591 + }, + { + "epoch": 0.3513353115727003, + "grad_norm": 0.5563094019889832, + "learning_rate": 1e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7597048282623291, + "num_tokens": 284491652.0, + "step": 592 + }, + { + "epoch": 0.3519287833827893, + "grad_norm": 0.6057441234588623, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7511681914329529, + "num_tokens": 284943186.0, + "step": 593 + }, + { + "epoch": 0.35252225519287833, + "grad_norm": 0.5864159464836121, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 
0.7556041479110718, + "num_tokens": 285405371.0, + "step": 594 + }, + { + "epoch": 0.35311572700296734, + "grad_norm": 0.5314260125160217, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7635293006896973, + "num_tokens": 285898224.0, + "step": 595 + }, + { + "epoch": 0.3537091988130564, + "grad_norm": 0.5569499135017395, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7516154646873474, + "num_tokens": 286366722.0, + "step": 596 + }, + { + "epoch": 0.3543026706231454, + "grad_norm": 0.560921847820282, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7594915628433228, + "num_tokens": 286850979.0, + "step": 597 + }, + { + "epoch": 0.3548961424332344, + "grad_norm": 0.575174868106842, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7619478702545166, + "num_tokens": 287326346.0, + "step": 598 + }, + { + "epoch": 0.3554896142433234, + "grad_norm": 0.5423024296760559, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7434194087982178, + "num_tokens": 287774964.0, + "step": 599 + }, + { + "epoch": 0.3560830860534125, + "grad_norm": 0.5948648452758789, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7554945945739746, + "num_tokens": 288244556.0, + "step": 600 + }, + { + "epoch": 0.3566765578635015, + "grad_norm": 0.5762646794319153, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7508524656295776, + "num_tokens": 288761719.0, + "step": 601 + }, + { + "epoch": 0.3572700296735905, + "grad_norm": 0.5639305114746094, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7482524514198303, + "num_tokens": 289219801.0, + "step": 602 + }, + { + "epoch": 0.3578635014836795, + "grad_norm": 0.5900662541389465, + "learning_rate": 1e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7543624639511108, + "num_tokens": 289678867.0, + "step": 603 + }, + { + "epoch": 0.3584569732937685, + "grad_norm": 0.5649274587631226, 
+ "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7554794549942017, + "num_tokens": 290158310.0, + "step": 604 + }, + { + "epoch": 0.3590504451038576, + "grad_norm": 0.5879603028297424, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.734619677066803, + "num_tokens": 290613192.0, + "step": 605 + }, + { + "epoch": 0.3596439169139466, + "grad_norm": 0.5451756119728088, + "learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7496539354324341, + "num_tokens": 291133868.0, + "step": 606 + }, + { + "epoch": 0.3602373887240356, + "grad_norm": 0.5350669026374817, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7531763315200806, + "num_tokens": 291652339.0, + "step": 607 + }, + { + "epoch": 0.3608308605341246, + "grad_norm": 0.566201388835907, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7567121982574463, + "num_tokens": 292143803.0, + "step": 608 + }, + { + "epoch": 0.3614243323442137, + "grad_norm": 0.571175217628479, + "learning_rate": 1e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.7631946206092834, + "num_tokens": 292585059.0, + "step": 609 + }, + { + "epoch": 0.3620178041543027, + "grad_norm": 0.5693539381027222, + "learning_rate": 1e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7631338834762573, + "num_tokens": 293057136.0, + "step": 610 + }, + { + "epoch": 0.3626112759643917, + "grad_norm": 0.5240808129310608, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7548725008964539, + "num_tokens": 293557859.0, + "step": 611 + }, + { + "epoch": 0.3632047477744807, + "grad_norm": 0.6070042252540588, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7375357151031494, + "num_tokens": 294032933.0, + "step": 612 + }, + { + "epoch": 0.36379821958456976, + "grad_norm": 0.5544180274009705, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7562078237533569, + "num_tokens": 294518517.0, + "step": 613 + }, + 
{ + "epoch": 0.36439169139465877, + "grad_norm": 0.6115825176239014, + "learning_rate": 1e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7553347945213318, + "num_tokens": 294952266.0, + "step": 614 + }, + { + "epoch": 0.3649851632047478, + "grad_norm": 0.5365242958068848, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7605711221694946, + "num_tokens": 295427447.0, + "step": 615 + }, + { + "epoch": 0.3655786350148368, + "grad_norm": 0.6128702759742737, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7434370517730713, + "num_tokens": 295882606.0, + "step": 616 + }, + { + "epoch": 0.3661721068249258, + "grad_norm": 0.5846973657608032, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7436656355857849, + "num_tokens": 296354520.0, + "step": 617 + }, + { + "epoch": 0.36676557863501486, + "grad_norm": 0.559330403804779, + "learning_rate": 1e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7534429430961609, + "num_tokens": 296826902.0, + "step": 618 + }, + { + "epoch": 0.36735905044510386, + "grad_norm": 0.5464723706245422, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.76065993309021, + "num_tokens": 297312807.0, + "step": 619 + }, + { + "epoch": 0.36795252225519287, + "grad_norm": 0.6516916155815125, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7443974614143372, + "num_tokens": 297796189.0, + "step": 620 + }, + { + "epoch": 0.3685459940652819, + "grad_norm": 0.5433729887008667, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7590651512145996, + "num_tokens": 298292996.0, + "step": 621 + }, + { + "epoch": 0.36913946587537094, + "grad_norm": 0.5573770403862, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7532330751419067, + "num_tokens": 298768172.0, + "step": 622 + }, + { + "epoch": 0.36973293768545995, + "grad_norm": 0.5776697397232056, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 
0.7689776420593262, + "num_tokens": 299268714.0, + "step": 623 + }, + { + "epoch": 0.37032640949554896, + "grad_norm": 0.5592749118804932, + "learning_rate": 1e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.7498830556869507, + "num_tokens": 299777168.0, + "step": 624 + }, + { + "epoch": 0.37091988130563797, + "grad_norm": 0.5720778703689575, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7653712034225464, + "num_tokens": 300306471.0, + "step": 625 + }, + { + "epoch": 0.37151335311572703, + "grad_norm": 0.5761945843696594, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.743789792060852, + "num_tokens": 300787772.0, + "step": 626 + }, + { + "epoch": 0.37210682492581604, + "grad_norm": 0.5071203708648682, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7603079080581665, + "num_tokens": 301309625.0, + "step": 627 + }, + { + "epoch": 0.37270029673590505, + "grad_norm": 0.5475672483444214, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7499421834945679, + "num_tokens": 301793702.0, + "step": 628 + }, + { + "epoch": 0.37329376854599405, + "grad_norm": 0.5751859545707703, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7487756609916687, + "num_tokens": 302251119.0, + "step": 629 + }, + { + "epoch": 0.37388724035608306, + "grad_norm": 0.5265706777572632, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7695062160491943, + "num_tokens": 302746400.0, + "step": 630 + }, + { + "epoch": 0.3744807121661721, + "grad_norm": 0.5408419966697693, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7507607936859131, + "num_tokens": 303244713.0, + "step": 631 + }, + { + "epoch": 0.37507418397626113, + "grad_norm": 0.554083526134491, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7477618455886841, + "num_tokens": 303742621.0, + "step": 632 + }, + { + "epoch": 0.37566765578635014, + "grad_norm": 
0.5490123629570007, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7467294931411743, + "num_tokens": 304227375.0, + "step": 633 + }, + { + "epoch": 0.37626112759643915, + "grad_norm": 0.5449267029762268, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7424583435058594, + "num_tokens": 304729440.0, + "step": 634 + }, + { + "epoch": 0.3768545994065282, + "grad_norm": 0.5460028648376465, + "learning_rate": 1e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7506463527679443, + "num_tokens": 305261410.0, + "step": 635 + }, + { + "epoch": 0.3774480712166172, + "grad_norm": 0.5863503813743591, + "learning_rate": 1e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7516270875930786, + "num_tokens": 305720974.0, + "step": 636 + }, + { + "epoch": 0.37804154302670623, + "grad_norm": 0.5884842276573181, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7454081177711487, + "num_tokens": 306155442.0, + "step": 637 + }, + { + "epoch": 0.37863501483679524, + "grad_norm": 0.555542528629303, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7588081955909729, + "num_tokens": 306632062.0, + "step": 638 + }, + { + "epoch": 0.3792284866468843, + "grad_norm": 0.5815260410308838, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7597537636756897, + "num_tokens": 307079345.0, + "step": 639 + }, + { + "epoch": 0.3798219584569733, + "grad_norm": 0.5750163793563843, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7483726143836975, + "num_tokens": 307583243.0, + "step": 640 + }, + { + "epoch": 0.3804154302670623, + "grad_norm": 0.5586879253387451, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7545939683914185, + "num_tokens": 308104996.0, + "step": 641 + }, + { + "epoch": 0.3810089020771513, + "grad_norm": 0.5713316798210144, + "learning_rate": 1e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.7557711005210876, + "num_tokens": 
308579396.0, + "step": 642 + }, + { + "epoch": 0.38160237388724033, + "grad_norm": 0.5501584410667419, + "learning_rate": 1e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7465464472770691, + "num_tokens": 309082808.0, + "step": 643 + }, + { + "epoch": 0.3821958456973294, + "grad_norm": 0.5833489298820496, + "learning_rate": 1e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7520277500152588, + "num_tokens": 309521410.0, + "step": 644 + }, + { + "epoch": 0.3827893175074184, + "grad_norm": 0.5201936960220337, + "learning_rate": 1e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7520032525062561, + "num_tokens": 310042573.0, + "step": 645 + }, + { + "epoch": 0.3833827893175074, + "grad_norm": 0.5891461968421936, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7549723386764526, + "num_tokens": 310529928.0, + "step": 646 + }, + { + "epoch": 0.3839762611275964, + "grad_norm": 0.5323576331138611, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.74910569190979, + "num_tokens": 311025391.0, + "step": 647 + }, + { + "epoch": 0.3845697329376855, + "grad_norm": 0.5507381558418274, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7479363679885864, + "num_tokens": 311506470.0, + "step": 648 + }, + { + "epoch": 0.3851632047477745, + "grad_norm": 0.5571870803833008, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7329126596450806, + "num_tokens": 312002354.0, + "step": 649 + }, + { + "epoch": 0.3857566765578635, + "grad_norm": 0.5282754898071289, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7585102319717407, + "num_tokens": 312505436.0, + "step": 650 + }, + { + "epoch": 0.3863501483679525, + "grad_norm": 0.5422891974449158, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7524042129516602, + "num_tokens": 312998699.0, + "step": 651 + }, + { + "epoch": 0.38694362017804157, + "grad_norm": 0.5514971613883972, + "learning_rate": 1e-06, + "loss": 
0.8296, + "mean_token_accuracy": 0.7480944395065308, + "num_tokens": 313461650.0, + "step": 652 + }, + { + "epoch": 0.3875370919881306, + "grad_norm": 0.5773888826370239, + "learning_rate": 1e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7508195042610168, + "num_tokens": 313948199.0, + "step": 653 + }, + { + "epoch": 0.3881305637982196, + "grad_norm": 0.594409704208374, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7339022159576416, + "num_tokens": 314431004.0, + "step": 654 + }, + { + "epoch": 0.3887240356083086, + "grad_norm": 0.5585455894470215, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.740021288394928, + "num_tokens": 314925257.0, + "step": 655 + }, + { + "epoch": 0.3893175074183976, + "grad_norm": 0.5803738236427307, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7391076683998108, + "num_tokens": 315410789.0, + "step": 656 + }, + { + "epoch": 0.38991097922848666, + "grad_norm": 0.5673246383666992, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7477308511734009, + "num_tokens": 315862789.0, + "step": 657 + }, + { + "epoch": 0.3905044510385757, + "grad_norm": 0.5410482287406921, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7403621077537537, + "num_tokens": 316349497.0, + "step": 658 + }, + { + "epoch": 0.3910979228486647, + "grad_norm": 0.5773849487304688, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7448775768280029, + "num_tokens": 316818682.0, + "step": 659 + }, + { + "epoch": 0.3916913946587537, + "grad_norm": 0.5943523645401001, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7471504211425781, + "num_tokens": 317272363.0, + "step": 660 + }, + { + "epoch": 0.39228486646884275, + "grad_norm": 0.5016757845878601, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7679540514945984, + "num_tokens": 317821007.0, + "step": 661 + }, + { + "epoch": 0.39287833827893176, 
+ "grad_norm": 0.5600405335426331, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7524677515029907, + "num_tokens": 318280806.0, + "step": 662 + }, + { + "epoch": 0.39347181008902077, + "grad_norm": 0.5649970173835754, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7489396333694458, + "num_tokens": 318771823.0, + "step": 663 + }, + { + "epoch": 0.3940652818991098, + "grad_norm": 0.5394622087478638, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7585585117340088, + "num_tokens": 319284944.0, + "step": 664 + }, + { + "epoch": 0.39465875370919884, + "grad_norm": 0.5648723840713501, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7646071314811707, + "num_tokens": 319754933.0, + "step": 665 + }, + { + "epoch": 0.39525222551928785, + "grad_norm": 0.5723648071289062, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7361080646514893, + "num_tokens": 320206144.0, + "step": 666 + }, + { + "epoch": 0.39584569732937686, + "grad_norm": 0.5446099638938904, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7652710676193237, + "num_tokens": 320695029.0, + "step": 667 + }, + { + "epoch": 0.39643916913946586, + "grad_norm": 0.5694246292114258, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7396096587181091, + "num_tokens": 321166817.0, + "step": 668 + }, + { + "epoch": 0.39703264094955487, + "grad_norm": 0.5801451206207275, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.75074303150177, + "num_tokens": 321645038.0, + "step": 669 + }, + { + "epoch": 0.39762611275964393, + "grad_norm": 0.5736537575721741, + "learning_rate": 1e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.755815327167511, + "num_tokens": 322104748.0, + "step": 670 + }, + { + "epoch": 0.39821958456973294, + "grad_norm": 0.5486465096473694, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7491623759269714, + 
"num_tokens": 322587687.0, + "step": 671 + }, + { + "epoch": 0.39881305637982195, + "grad_norm": 0.5411856174468994, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.747677743434906, + "num_tokens": 323063890.0, + "step": 672 + }, + { + "epoch": 0.39940652818991096, + "grad_norm": 0.5564053058624268, + "learning_rate": 1e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7586410045623779, + "num_tokens": 323578382.0, + "step": 673 + }, + { + "epoch": 0.4, + "grad_norm": 0.5521569848060608, + "learning_rate": 1e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7565730810165405, + "num_tokens": 324066387.0, + "step": 674 + }, + { + "epoch": 0.40059347181008903, + "grad_norm": 0.5697529315948486, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7373706102371216, + "num_tokens": 324539695.0, + "step": 675 + }, + { + "epoch": 0.40118694362017804, + "grad_norm": 0.5608291029930115, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7611640691757202, + "num_tokens": 325047753.0, + "step": 676 + }, + { + "epoch": 0.40178041543026705, + "grad_norm": 0.6088599562644958, + "learning_rate": 1e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.7535783648490906, + "num_tokens": 325552429.0, + "step": 677 + }, + { + "epoch": 0.4023738872403561, + "grad_norm": 0.6074817180633545, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7469769716262817, + "num_tokens": 325969758.0, + "step": 678 + }, + { + "epoch": 0.4029673590504451, + "grad_norm": 0.5552223920822144, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7564205527305603, + "num_tokens": 326391658.0, + "step": 679 + }, + { + "epoch": 0.4035608308605341, + "grad_norm": 0.5465179085731506, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7452394962310791, + "num_tokens": 326879597.0, + "step": 680 + }, + { + "epoch": 0.40415430267062313, + "grad_norm": 0.5294557809829712, + "learning_rate": 1e-06, + 
"loss": 0.7869, + "mean_token_accuracy": 0.7566651105880737, + "num_tokens": 327401418.0, + "step": 681 + }, + { + "epoch": 0.40474777448071214, + "grad_norm": 0.5531354546546936, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7620320320129395, + "num_tokens": 327913081.0, + "step": 682 + }, + { + "epoch": 0.4053412462908012, + "grad_norm": 0.5848676562309265, + "learning_rate": 1e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7613532543182373, + "num_tokens": 328402418.0, + "step": 683 + }, + { + "epoch": 0.4059347181008902, + "grad_norm": 0.5601630806922913, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7611477971076965, + "num_tokens": 328874424.0, + "step": 684 + }, + { + "epoch": 0.4065281899109792, + "grad_norm": 0.5469821691513062, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7466841340065002, + "num_tokens": 329353277.0, + "step": 685 + }, + { + "epoch": 0.40712166172106823, + "grad_norm": 0.6236651539802551, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7661000490188599, + "num_tokens": 329790001.0, + "step": 686 + }, + { + "epoch": 0.4077151335311573, + "grad_norm": 0.568986177444458, + "learning_rate": 1e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7516487836837769, + "num_tokens": 330290881.0, + "step": 687 + }, + { + "epoch": 0.4083086053412463, + "grad_norm": 0.5460367202758789, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7473405599594116, + "num_tokens": 330765906.0, + "step": 688 + }, + { + "epoch": 0.4089020771513353, + "grad_norm": 0.5746333599090576, + "learning_rate": 1e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7532278895378113, + "num_tokens": 331189520.0, + "step": 689 + }, + { + "epoch": 0.4094955489614243, + "grad_norm": 0.5765677690505981, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7514903545379639, + "num_tokens": 331640686.0, + "step": 690 + }, + { + "epoch": 
0.4100890207715134, + "grad_norm": 0.5665356516838074, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.76529461145401, + "num_tokens": 332083237.0, + "step": 691 + }, + { + "epoch": 0.4106824925816024, + "grad_norm": 0.5599573850631714, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7399706244468689, + "num_tokens": 332577387.0, + "step": 692 + }, + { + "epoch": 0.4112759643916914, + "grad_norm": 0.6093911528587341, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7317517995834351, + "num_tokens": 332981033.0, + "step": 693 + }, + { + "epoch": 0.4118694362017804, + "grad_norm": 0.5658978223800659, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7669205665588379, + "num_tokens": 333399804.0, + "step": 694 + }, + { + "epoch": 0.4124629080118694, + "grad_norm": 0.5750860571861267, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7525268793106079, + "num_tokens": 333899143.0, + "step": 695 + }, + { + "epoch": 0.4130563798219585, + "grad_norm": 0.5394628047943115, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7506676316261292, + "num_tokens": 334393659.0, + "step": 696 + }, + { + "epoch": 0.4136498516320475, + "grad_norm": 0.5293723940849304, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7530666589736938, + "num_tokens": 334826412.0, + "step": 697 + }, + { + "epoch": 0.4142433234421365, + "grad_norm": 0.5406870245933533, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7555041909217834, + "num_tokens": 335302457.0, + "step": 698 + }, + { + "epoch": 0.4148367952522255, + "grad_norm": 0.5417015552520752, + "learning_rate": 1e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7499828338623047, + "num_tokens": 335793530.0, + "step": 699 + }, + { + "epoch": 0.41543026706231456, + "grad_norm": 0.5237062573432922, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 
0.7516801953315735, + "num_tokens": 336294432.0, + "step": 700 + }, + { + "epoch": 0.41602373887240357, + "grad_norm": 0.5136372447013855, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7453574538230896, + "num_tokens": 336810499.0, + "step": 701 + }, + { + "epoch": 0.4166172106824926, + "grad_norm": 0.5829080939292908, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7492295503616333, + "num_tokens": 337257991.0, + "step": 702 + }, + { + "epoch": 0.4172106824925816, + "grad_norm": 0.546502411365509, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7562495470046997, + "num_tokens": 337730511.0, + "step": 703 + }, + { + "epoch": 0.41780415430267065, + "grad_norm": 0.5563671588897705, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7306000590324402, + "num_tokens": 338234724.0, + "step": 704 + }, + { + "epoch": 0.41839762611275966, + "grad_norm": 0.5603218674659729, + "learning_rate": 1e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7484540939331055, + "num_tokens": 338713776.0, + "step": 705 + }, + { + "epoch": 0.41899109792284867, + "grad_norm": 0.5271628499031067, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7689099907875061, + "num_tokens": 339224081.0, + "step": 706 + }, + { + "epoch": 0.4195845697329377, + "grad_norm": 0.5888941884040833, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7421530485153198, + "num_tokens": 339681850.0, + "step": 707 + }, + { + "epoch": 0.4201780415430267, + "grad_norm": 0.5761911869049072, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7610605359077454, + "num_tokens": 340147126.0, + "step": 708 + }, + { + "epoch": 0.42077151335311574, + "grad_norm": 0.559110164642334, + "learning_rate": 1e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7608551383018494, + "num_tokens": 340674054.0, + "step": 709 + }, + { + "epoch": 0.42136498516320475, + "grad_norm": 
0.5726150274276733, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7553840279579163, + "num_tokens": 341115299.0, + "step": 710 + }, + { + "epoch": 0.42195845697329376, + "grad_norm": 0.5551873445510864, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7523263096809387, + "num_tokens": 341656676.0, + "step": 711 + }, + { + "epoch": 0.42255192878338277, + "grad_norm": 0.595927894115448, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7587463855743408, + "num_tokens": 342116723.0, + "step": 712 + }, + { + "epoch": 0.42314540059347183, + "grad_norm": 0.5542543530464172, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7603583335876465, + "num_tokens": 342579365.0, + "step": 713 + }, + { + "epoch": 0.42373887240356084, + "grad_norm": 0.5437420606613159, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7482599020004272, + "num_tokens": 343060073.0, + "step": 714 + }, + { + "epoch": 0.42433234421364985, + "grad_norm": 0.5550129413604736, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7455755472183228, + "num_tokens": 343534224.0, + "step": 715 + }, + { + "epoch": 0.42492581602373886, + "grad_norm": 0.5533819198608398, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7496623992919922, + "num_tokens": 344006915.0, + "step": 716 + }, + { + "epoch": 0.4255192878338279, + "grad_norm": 0.5432994365692139, + "learning_rate": 1e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7532941102981567, + "num_tokens": 344516448.0, + "step": 717 + }, + { + "epoch": 0.4261127596439169, + "grad_norm": 0.5502431988716125, + "learning_rate": 1e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7586491107940674, + "num_tokens": 345005895.0, + "step": 718 + }, + { + "epoch": 0.42670623145400594, + "grad_norm": 0.5944370627403259, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7473291754722595, + "num_tokens": 
345429029.0, + "step": 719 + }, + { + "epoch": 0.42729970326409494, + "grad_norm": 0.5458663702011108, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7545507550239563, + "num_tokens": 345909921.0, + "step": 720 + }, + { + "epoch": 0.42789317507418395, + "grad_norm": 0.5382313132286072, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7560734748840332, + "num_tokens": 346383782.0, + "step": 721 + }, + { + "epoch": 0.428486646884273, + "grad_norm": 0.565782904624939, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7448025941848755, + "num_tokens": 346857288.0, + "step": 722 + }, + { + "epoch": 0.429080118694362, + "grad_norm": 0.552242636680603, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7470074892044067, + "num_tokens": 347328270.0, + "step": 723 + }, + { + "epoch": 0.42967359050445103, + "grad_norm": 0.5025943517684937, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7669658064842224, + "num_tokens": 347850018.0, + "step": 724 + }, + { + "epoch": 0.43026706231454004, + "grad_norm": 0.5303850173950195, + "learning_rate": 1e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7494666576385498, + "num_tokens": 348345307.0, + "step": 725 + }, + { + "epoch": 0.4308605341246291, + "grad_norm": 0.5723504424095154, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7471235990524292, + "num_tokens": 348807097.0, + "step": 726 + }, + { + "epoch": 0.4314540059347181, + "grad_norm": 0.5869706869125366, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7631680369377136, + "num_tokens": 349247260.0, + "step": 727 + }, + { + "epoch": 0.4320474777448071, + "grad_norm": 0.5869740843772888, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7520400285720825, + "num_tokens": 349716049.0, + "step": 728 + }, + { + "epoch": 0.4326409495548961, + "grad_norm": 0.5310616493225098, + "learning_rate": 1e-06, + 
"loss": 0.809, + "mean_token_accuracy": 0.7513155341148376, + "num_tokens": 350187664.0, + "step": 729 + }, + { + "epoch": 0.4332344213649852, + "grad_norm": 0.5679337978363037, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7693670392036438, + "num_tokens": 350635276.0, + "step": 730 + }, + { + "epoch": 0.4338278931750742, + "grad_norm": 0.5780270099639893, + "learning_rate": 1e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7573405504226685, + "num_tokens": 351105406.0, + "step": 731 + }, + { + "epoch": 0.4344213649851632, + "grad_norm": 0.5856975317001343, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7297420501708984, + "num_tokens": 351527524.0, + "step": 732 + }, + { + "epoch": 0.4350148367952522, + "grad_norm": 0.549580991268158, + "learning_rate": 1e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7532798051834106, + "num_tokens": 351991745.0, + "step": 733 + }, + { + "epoch": 0.4356083086053412, + "grad_norm": 0.5601057410240173, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7640630006790161, + "num_tokens": 352479617.0, + "step": 734 + }, + { + "epoch": 0.4362017804154303, + "grad_norm": 0.5495672225952148, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7489907741546631, + "num_tokens": 352950141.0, + "step": 735 + }, + { + "epoch": 0.4367952522255193, + "grad_norm": 0.5391093492507935, + "learning_rate": 1e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7558225989341736, + "num_tokens": 353438019.0, + "step": 736 + }, + { + "epoch": 0.4373887240356083, + "grad_norm": 0.5112774968147278, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.75117027759552, + "num_tokens": 353972755.0, + "step": 737 + }, + { + "epoch": 0.4379821958456973, + "grad_norm": 0.5397323369979858, + "learning_rate": 1e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7508024573326111, + "num_tokens": 354430793.0, + "step": 738 + }, + { + "epoch": 
0.43857566765578637, + "grad_norm": 0.5524953007698059, + "learning_rate": 1e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7770769000053406, + "num_tokens": 354920770.0, + "step": 739 + }, + { + "epoch": 0.4391691394658754, + "grad_norm": 0.5627418756484985, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7555370330810547, + "num_tokens": 355385587.0, + "step": 740 + }, + { + "epoch": 0.4397626112759644, + "grad_norm": 0.5705970525741577, + "learning_rate": 1e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7458717823028564, + "num_tokens": 355843597.0, + "step": 741 + }, + { + "epoch": 0.4403560830860534, + "grad_norm": 0.5392569303512573, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7682050466537476, + "num_tokens": 356312231.0, + "step": 742 + }, + { + "epoch": 0.44094955489614246, + "grad_norm": 0.5771243572235107, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7545139789581299, + "num_tokens": 356778287.0, + "step": 743 + }, + { + "epoch": 0.44154302670623147, + "grad_norm": 0.5535919666290283, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7668114900588989, + "num_tokens": 357267491.0, + "step": 744 + }, + { + "epoch": 0.4421364985163205, + "grad_norm": 0.5139079093933105, + "learning_rate": 1e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7551300525665283, + "num_tokens": 357775282.0, + "step": 745 + }, + { + "epoch": 0.4427299703264095, + "grad_norm": 0.5911197662353516, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7570915818214417, + "num_tokens": 358272385.0, + "step": 746 + }, + { + "epoch": 0.4433234421364985, + "grad_norm": 0.5683233737945557, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7319771647453308, + "num_tokens": 358762525.0, + "step": 747 + }, + { + "epoch": 0.44391691394658755, + "grad_norm": 0.5392531156539917, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 
0.7476528882980347, + "num_tokens": 359273233.0, + "step": 748 + }, + { + "epoch": 0.44451038575667656, + "grad_norm": 0.5892350077629089, + "learning_rate": 1e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7538102865219116, + "num_tokens": 359754733.0, + "step": 749 + }, + { + "epoch": 0.44510385756676557, + "grad_norm": 0.5708402395248413, + "learning_rate": 1e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7534244656562805, + "num_tokens": 360218776.0, + "step": 750 + }, + { + "epoch": 0.4456973293768546, + "grad_norm": 0.5836385488510132, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7384628057479858, + "num_tokens": 360673646.0, + "step": 751 + }, + { + "epoch": 0.44629080118694364, + "grad_norm": 0.5728083848953247, + "learning_rate": 1e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7553647756576538, + "num_tokens": 361110596.0, + "step": 752 + }, + { + "epoch": 0.44688427299703265, + "grad_norm": 0.5976822972297668, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7572668790817261, + "num_tokens": 361593758.0, + "step": 753 + }, + { + "epoch": 0.44747774480712166, + "grad_norm": 0.5697898864746094, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7618194818496704, + "num_tokens": 362099187.0, + "step": 754 + }, + { + "epoch": 0.44807121661721067, + "grad_norm": 0.5492198467254639, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7706893682479858, + "num_tokens": 362583325.0, + "step": 755 + }, + { + "epoch": 0.44866468842729973, + "grad_norm": 0.5636993646621704, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.758294939994812, + "num_tokens": 363057317.0, + "step": 756 + }, + { + "epoch": 0.44925816023738874, + "grad_norm": 0.5592188835144043, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7651373147964478, + "num_tokens": 363519530.0, + "step": 757 + }, + { + "epoch": 0.44985163204747775, + "grad_norm": 
0.5725254416465759, + "learning_rate": 1e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7510793209075928, + "num_tokens": 363988243.0, + "step": 758 + }, + { + "epoch": 0.45044510385756675, + "grad_norm": 0.5991626977920532, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7402548789978027, + "num_tokens": 364451915.0, + "step": 759 + }, + { + "epoch": 0.45103857566765576, + "grad_norm": 0.5534422397613525, + "learning_rate": 1e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7486854791641235, + "num_tokens": 364932628.0, + "step": 760 + }, + { + "epoch": 0.4516320474777448, + "grad_norm": 0.5721704959869385, + "learning_rate": 1e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7456393241882324, + "num_tokens": 365399564.0, + "step": 761 + }, + { + "epoch": 0.45222551928783383, + "grad_norm": 0.5552239418029785, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.748237133026123, + "num_tokens": 365913291.0, + "step": 762 + }, + { + "epoch": 0.45281899109792284, + "grad_norm": 0.5653426647186279, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7483237385749817, + "num_tokens": 366371220.0, + "step": 763 + }, + { + "epoch": 0.45341246290801185, + "grad_norm": 0.5380280613899231, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7494775652885437, + "num_tokens": 366852925.0, + "step": 764 + }, + { + "epoch": 0.4540059347181009, + "grad_norm": 0.5286559462547302, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.744308352470398, + "num_tokens": 367368027.0, + "step": 765 + }, + { + "epoch": 0.4545994065281899, + "grad_norm": 0.5403891801834106, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7655724287033081, + "num_tokens": 367859438.0, + "step": 766 + }, + { + "epoch": 0.45519287833827893, + "grad_norm": 0.5456708073616028, + "learning_rate": 1e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7574352025985718, + "num_tokens": 
368351161.0, + "step": 767 + }, + { + "epoch": 0.45578635014836794, + "grad_norm": 0.5330765247344971, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7510629892349243, + "num_tokens": 368879782.0, + "step": 768 + }, + { + "epoch": 0.456379821958457, + "grad_norm": 0.5342236161231995, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7464929819107056, + "num_tokens": 369388550.0, + "step": 769 + }, + { + "epoch": 0.456973293768546, + "grad_norm": 0.5470307469367981, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.748131275177002, + "num_tokens": 369877716.0, + "step": 770 + }, + { + "epoch": 0.457566765578635, + "grad_norm": 0.5580595135688782, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7316849231719971, + "num_tokens": 370350114.0, + "step": 771 + }, + { + "epoch": 0.458160237388724, + "grad_norm": 0.5700104832649231, + "learning_rate": 1e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7571947574615479, + "num_tokens": 370830585.0, + "step": 772 + }, + { + "epoch": 0.45875370919881303, + "grad_norm": 0.5660414099693298, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7393524050712585, + "num_tokens": 371272114.0, + "step": 773 + }, + { + "epoch": 0.4593471810089021, + "grad_norm": 0.540190577507019, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.758215069770813, + "num_tokens": 371801167.0, + "step": 774 + }, + { + "epoch": 0.4599406528189911, + "grad_norm": 0.5371624231338501, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7437485456466675, + "num_tokens": 372335200.0, + "step": 775 + }, + { + "epoch": 0.4605341246290801, + "grad_norm": 0.5585528612136841, + "learning_rate": 1e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7546184659004211, + "num_tokens": 372819982.0, + "step": 776 + }, + { + "epoch": 0.4611275964391691, + "grad_norm": 0.6008590459823608, + "learning_rate": 1e-06, + "loss": 
0.8492, + "mean_token_accuracy": 0.7383506298065186, + "num_tokens": 373234321.0, + "step": 777 + }, + { + "epoch": 0.4617210682492582, + "grad_norm": 0.5766400098800659, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7495560050010681, + "num_tokens": 373730676.0, + "step": 778 + }, + { + "epoch": 0.4623145400593472, + "grad_norm": 0.5846348404884338, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7478852868080139, + "num_tokens": 374188967.0, + "step": 779 + }, + { + "epoch": 0.4629080118694362, + "grad_norm": 0.5595541000366211, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7689139246940613, + "num_tokens": 374652765.0, + "step": 780 + }, + { + "epoch": 0.4635014836795252, + "grad_norm": 0.5636993050575256, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7565938830375671, + "num_tokens": 375134100.0, + "step": 781 + }, + { + "epoch": 0.46409495548961427, + "grad_norm": 0.5532126426696777, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7456442713737488, + "num_tokens": 375624226.0, + "step": 782 + }, + { + "epoch": 0.4646884272997033, + "grad_norm": 0.5708792209625244, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7686219811439514, + "num_tokens": 376069325.0, + "step": 783 + }, + { + "epoch": 0.4652818991097923, + "grad_norm": 0.5956181287765503, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.753544270992279, + "num_tokens": 376510791.0, + "step": 784 + }, + { + "epoch": 0.4658753709198813, + "grad_norm": 0.5424932837486267, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7497167587280273, + "num_tokens": 376991408.0, + "step": 785 + }, + { + "epoch": 0.4664688427299703, + "grad_norm": 0.561553418636322, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7358737587928772, + "num_tokens": 377453702.0, + "step": 786 + }, + { + "epoch": 0.46706231454005936, + 
"grad_norm": 0.6025390625, + "learning_rate": 1e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7490152716636658, + "num_tokens": 377923232.0, + "step": 787 + }, + { + "epoch": 0.4676557863501484, + "grad_norm": 0.5476700067520142, + "learning_rate": 1e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7811021208763123, + "num_tokens": 378428064.0, + "step": 788 + }, + { + "epoch": 0.4682492581602374, + "grad_norm": 0.5191574692726135, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7453975081443787, + "num_tokens": 378936949.0, + "step": 789 + }, + { + "epoch": 0.4688427299703264, + "grad_norm": 0.5562506914138794, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.747445285320282, + "num_tokens": 379402971.0, + "step": 790 + }, + { + "epoch": 0.46943620178041545, + "grad_norm": 0.5550343990325928, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7533220648765564, + "num_tokens": 379883497.0, + "step": 791 + }, + { + "epoch": 0.47002967359050446, + "grad_norm": 0.5770052671432495, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7646129727363586, + "num_tokens": 380350849.0, + "step": 792 + }, + { + "epoch": 0.47062314540059347, + "grad_norm": 0.544775664806366, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7508296966552734, + "num_tokens": 380848661.0, + "step": 793 + }, + { + "epoch": 0.4712166172106825, + "grad_norm": 0.5278334021568298, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7655129432678223, + "num_tokens": 381387823.0, + "step": 794 + }, + { + "epoch": 0.47181008902077154, + "grad_norm": 0.568598747253418, + "learning_rate": 1e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7528822422027588, + "num_tokens": 381909876.0, + "step": 795 + }, + { + "epoch": 0.47240356083086055, + "grad_norm": 0.54609215259552, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7460508346557617, + "num_tokens": 
382407647.0, + "step": 796 + }, + { + "epoch": 0.47299703264094956, + "grad_norm": 0.5784177184104919, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7460424304008484, + "num_tokens": 382890934.0, + "step": 797 + }, + { + "epoch": 0.47359050445103856, + "grad_norm": 0.5856095552444458, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7455266118049622, + "num_tokens": 383349345.0, + "step": 798 + }, + { + "epoch": 0.47418397626112757, + "grad_norm": 0.5708518624305725, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7534188628196716, + "num_tokens": 383820233.0, + "step": 799 + }, + { + "epoch": 0.47477744807121663, + "grad_norm": 0.532041072845459, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7496809959411621, + "num_tokens": 384320690.0, + "step": 800 + }, + { + "epoch": 0.47537091988130564, + "grad_norm": 0.5202295184135437, + "learning_rate": 1e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7653578519821167, + "num_tokens": 384810854.0, + "step": 801 + }, + { + "epoch": 0.47596439169139465, + "grad_norm": 0.5445143580436707, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7504401803016663, + "num_tokens": 385322455.0, + "step": 802 + }, + { + "epoch": 0.47655786350148366, + "grad_norm": 0.5518936514854431, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7421913146972656, + "num_tokens": 385788912.0, + "step": 803 + }, + { + "epoch": 0.4771513353115727, + "grad_norm": 0.5715309977531433, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7551127672195435, + "num_tokens": 386266072.0, + "step": 804 + }, + { + "epoch": 0.47774480712166173, + "grad_norm": 0.5609148144721985, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7738053798675537, + "num_tokens": 386720268.0, + "step": 805 + }, + { + "epoch": 0.47833827893175074, + "grad_norm": 0.5348259806632996, + "learning_rate": 1e-06, 
+ "loss": 0.8223, + "mean_token_accuracy": 0.7464404106140137, + "num_tokens": 387213349.0, + "step": 806 + }, + { + "epoch": 0.47893175074183975, + "grad_norm": 0.5327772498130798, + "learning_rate": 1e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7585949897766113, + "num_tokens": 387731058.0, + "step": 807 + }, + { + "epoch": 0.4795252225519288, + "grad_norm": 0.5320379734039307, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7379993796348572, + "num_tokens": 388235286.0, + "step": 808 + }, + { + "epoch": 0.4801186943620178, + "grad_norm": 0.5714460611343384, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7540408372879028, + "num_tokens": 388766754.0, + "step": 809 + }, + { + "epoch": 0.4807121661721068, + "grad_norm": 0.5743693709373474, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7545868158340454, + "num_tokens": 389228577.0, + "step": 810 + }, + { + "epoch": 0.48130563798219583, + "grad_norm": 0.563279390335083, + "learning_rate": 1e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7527661323547363, + "num_tokens": 389634960.0, + "step": 811 + }, + { + "epoch": 0.48189910979228484, + "grad_norm": 0.605990469455719, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7523877620697021, + "num_tokens": 390090889.0, + "step": 812 + }, + { + "epoch": 0.4824925816023739, + "grad_norm": 0.6071136593818665, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7554664611816406, + "num_tokens": 390577469.0, + "step": 813 + }, + { + "epoch": 0.4830860534124629, + "grad_norm": 0.5406844019889832, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7533854246139526, + "num_tokens": 391079944.0, + "step": 814 + }, + { + "epoch": 0.4836795252225519, + "grad_norm": 0.5753489136695862, + "learning_rate": 1e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7481876015663147, + "num_tokens": 391571512.0, + "step": 815 + }, + { + "epoch": 
0.48427299703264093, + "grad_norm": 0.5940549969673157, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.74949711561203, + "num_tokens": 392050697.0, + "step": 816 + }, + { + "epoch": 0.48486646884273, + "grad_norm": 0.5501951575279236, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.750166654586792, + "num_tokens": 392502529.0, + "step": 817 + }, + { + "epoch": 0.485459940652819, + "grad_norm": 0.5450282096862793, + "learning_rate": 1e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7555243968963623, + "num_tokens": 393001590.0, + "step": 818 + }, + { + "epoch": 0.486053412462908, + "grad_norm": 0.5499124526977539, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7628515958786011, + "num_tokens": 393494395.0, + "step": 819 + }, + { + "epoch": 0.486646884272997, + "grad_norm": 0.5617973208427429, + "learning_rate": 1e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.756775975227356, + "num_tokens": 393954458.0, + "step": 820 + }, + { + "epoch": 0.4872403560830861, + "grad_norm": 0.5425310730934143, + "learning_rate": 1e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7561733722686768, + "num_tokens": 394431077.0, + "step": 821 + }, + { + "epoch": 0.4878338278931751, + "grad_norm": 0.5775155425071716, + "learning_rate": 1e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7566568851470947, + "num_tokens": 394891354.0, + "step": 822 + }, + { + "epoch": 0.4884272997032641, + "grad_norm": 0.623906672000885, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.751537561416626, + "num_tokens": 395294879.0, + "step": 823 + }, + { + "epoch": 0.4890207715133531, + "grad_norm": 0.580875813961029, + "learning_rate": 1e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7536903619766235, + "num_tokens": 395737822.0, + "step": 824 + }, + { + "epoch": 0.4896142433234421, + "grad_norm": 0.5618726015090942, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7617065906524658, + 
"num_tokens": 396220314.0, + "step": 825 + }, + { + "epoch": 0.4902077151335312, + "grad_norm": 0.5910845398902893, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7473633289337158, + "num_tokens": 396685785.0, + "step": 826 + }, + { + "epoch": 0.4908011869436202, + "grad_norm": 0.5502856969833374, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7437036037445068, + "num_tokens": 397186636.0, + "step": 827 + }, + { + "epoch": 0.4913946587537092, + "grad_norm": 0.5847481489181519, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.765493631362915, + "num_tokens": 397643288.0, + "step": 828 + }, + { + "epoch": 0.4919881305637982, + "grad_norm": 0.5828969478607178, + "learning_rate": 1e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7528236508369446, + "num_tokens": 398092382.0, + "step": 829 + }, + { + "epoch": 0.49258160237388726, + "grad_norm": 0.638439953327179, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7531532645225525, + "num_tokens": 398528994.0, + "step": 830 + }, + { + "epoch": 0.49317507418397627, + "grad_norm": 0.5988095998764038, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7541864514350891, + "num_tokens": 399019097.0, + "step": 831 + }, + { + "epoch": 0.4937685459940653, + "grad_norm": 0.5777338147163391, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7535659670829773, + "num_tokens": 399480378.0, + "step": 832 + }, + { + "epoch": 0.4943620178041543, + "grad_norm": 0.586035430431366, + "learning_rate": 1e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7629550099372864, + "num_tokens": 399949598.0, + "step": 833 + }, + { + "epoch": 0.49495548961424335, + "grad_norm": 0.5150814652442932, + "learning_rate": 1e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7683749198913574, + "num_tokens": 400457846.0, + "step": 834 + }, + { + "epoch": 0.49554896142433236, + "grad_norm": 0.5511972308158875, + "learning_rate": 
1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.752955973148346, + "num_tokens": 400951669.0, + "step": 835 + }, + { + "epoch": 0.49614243323442137, + "grad_norm": 0.5694829821586609, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.752372682094574, + "num_tokens": 401419496.0, + "step": 836 + }, + { + "epoch": 0.4967359050445104, + "grad_norm": 0.555248498916626, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7467404007911682, + "num_tokens": 401885929.0, + "step": 837 + }, + { + "epoch": 0.4973293768545994, + "grad_norm": 0.5272230505943298, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.767291247844696, + "num_tokens": 402377788.0, + "step": 838 + }, + { + "epoch": 0.49792284866468844, + "grad_norm": 0.5582213401794434, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.748894453048706, + "num_tokens": 402836598.0, + "step": 839 + }, + { + "epoch": 0.49851632047477745, + "grad_norm": 0.5714107155799866, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.763260006904602, + "num_tokens": 403292234.0, + "step": 840 + }, + { + "epoch": 0.49910979228486646, + "grad_norm": 0.5783384442329407, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7493602633476257, + "num_tokens": 403786444.0, + "step": 841 + }, + { + "epoch": 0.49970326409495547, + "grad_norm": 0.548602819442749, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7549030780792236, + "num_tokens": 404231178.0, + "step": 842 + }, + { + "epoch": 0.5002967359050445, + "grad_norm": 0.5472541451454163, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7653549909591675, + "num_tokens": 404725087.0, + "step": 843 + }, + { + "epoch": 0.5008902077151335, + "grad_norm": 0.5640508532524109, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7653967142105103, + "num_tokens": 405200081.0, + "step": 844 + }, + { + "epoch": 
0.5014836795252225, + "grad_norm": 0.5943348407745361, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7588973045349121, + "num_tokens": 405641730.0, + "step": 845 + }, + { + "epoch": 0.5020771513353116, + "grad_norm": 0.5553693771362305, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7571490406990051, + "num_tokens": 406115871.0, + "step": 846 + }, + { + "epoch": 0.5026706231454006, + "grad_norm": 0.552622377872467, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.75911545753479, + "num_tokens": 406572827.0, + "step": 847 + }, + { + "epoch": 0.5032640949554896, + "grad_norm": 0.5524778366088867, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7570802569389343, + "num_tokens": 407077427.0, + "step": 848 + }, + { + "epoch": 0.5038575667655787, + "grad_norm": 0.5739008188247681, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7568336725234985, + "num_tokens": 407554585.0, + "step": 849 + }, + { + "epoch": 0.5044510385756676, + "grad_norm": 0.5479874610900879, + "learning_rate": 1e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.766028642654419, + "num_tokens": 408043871.0, + "step": 850 + }, + { + "epoch": 0.5050445103857567, + "grad_norm": 0.5504869222640991, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7585635781288147, + "num_tokens": 408512038.0, + "step": 851 + }, + { + "epoch": 0.5056379821958457, + "grad_norm": 0.5697388052940369, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7548317909240723, + "num_tokens": 408979628.0, + "step": 852 + }, + { + "epoch": 0.5062314540059347, + "grad_norm": 0.5639750361442566, + "learning_rate": 1e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7547666430473328, + "num_tokens": 409508931.0, + "step": 853 + }, + { + "epoch": 0.5068249258160238, + "grad_norm": 0.5387076139450073, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 
0.7641481161117554, + "num_tokens": 410007051.0, + "step": 854 + }, + { + "epoch": 0.5074183976261127, + "grad_norm": 0.5404236912727356, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7612900733947754, + "num_tokens": 410487754.0, + "step": 855 + }, + { + "epoch": 0.5080118694362018, + "grad_norm": 0.565373420715332, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.735049843788147, + "num_tokens": 410951342.0, + "step": 856 + }, + { + "epoch": 0.5086053412462908, + "grad_norm": 0.56635981798172, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7570415735244751, + "num_tokens": 411417828.0, + "step": 857 + }, + { + "epoch": 0.5091988130563798, + "grad_norm": 0.5409464836120605, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7442117929458618, + "num_tokens": 411905379.0, + "step": 858 + }, + { + "epoch": 0.5097922848664689, + "grad_norm": 0.582057774066925, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7501811981201172, + "num_tokens": 412354827.0, + "step": 859 + }, + { + "epoch": 0.5103857566765578, + "grad_norm": 0.5452066659927368, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7531661987304688, + "num_tokens": 412828844.0, + "step": 860 + }, + { + "epoch": 0.5109792284866469, + "grad_norm": 0.5794256329536438, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7539281845092773, + "num_tokens": 413281939.0, + "step": 861 + }, + { + "epoch": 0.511572700296736, + "grad_norm": 0.5394991636276245, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.763791561126709, + "num_tokens": 413767527.0, + "step": 862 + }, + { + "epoch": 0.5121661721068249, + "grad_norm": 0.580703854560852, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7403552532196045, + "num_tokens": 414238755.0, + "step": 863 + }, + { + "epoch": 0.512759643916914, + "grad_norm": 0.5650049448013306, + 
"learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7556347846984863, + "num_tokens": 414712675.0, + "step": 864 + }, + { + "epoch": 0.5133531157270029, + "grad_norm": 0.5327118039131165, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7450144290924072, + "num_tokens": 415201809.0, + "step": 865 + }, + { + "epoch": 0.513946587537092, + "grad_norm": 0.5479210019111633, + "learning_rate": 1e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7602115273475647, + "num_tokens": 415665972.0, + "step": 866 + }, + { + "epoch": 0.5145400593471811, + "grad_norm": 0.5282602906227112, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.757692813873291, + "num_tokens": 416127329.0, + "step": 867 + }, + { + "epoch": 0.51513353115727, + "grad_norm": 0.5356688499450684, + "learning_rate": 1e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7664016485214233, + "num_tokens": 416608343.0, + "step": 868 + }, + { + "epoch": 0.5157270029673591, + "grad_norm": 0.5232899785041809, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7586491107940674, + "num_tokens": 417097087.0, + "step": 869 + }, + { + "epoch": 0.516320474777448, + "grad_norm": 0.5706787705421448, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7436732053756714, + "num_tokens": 417537384.0, + "step": 870 + }, + { + "epoch": 0.5169139465875371, + "grad_norm": 0.5229682326316833, + "learning_rate": 1e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.761630654335022, + "num_tokens": 418048052.0, + "step": 871 + }, + { + "epoch": 0.5175074183976262, + "grad_norm": 0.5145043730735779, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7781370282173157, + "num_tokens": 418533534.0, + "step": 872 + }, + { + "epoch": 0.5181008902077151, + "grad_norm": 0.5262883901596069, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7387619018554688, + "num_tokens": 419015631.0, + "step": 873 + }, + { + 
"epoch": 0.5186943620178042, + "grad_norm": 0.5516518950462341, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7498264312744141, + "num_tokens": 419478295.0, + "step": 874 + }, + { + "epoch": 0.5192878338278932, + "grad_norm": 0.5367767810821533, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7679024338722229, + "num_tokens": 419948052.0, + "step": 875 + }, + { + "epoch": 0.5198813056379822, + "grad_norm": 0.56280517578125, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7589740753173828, + "num_tokens": 420398331.0, + "step": 876 + }, + { + "epoch": 0.5204747774480712, + "grad_norm": 0.5497561693191528, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7498492002487183, + "num_tokens": 420851407.0, + "step": 877 + }, + { + "epoch": 0.5210682492581602, + "grad_norm": 0.5398897528648376, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.768547534942627, + "num_tokens": 421293389.0, + "step": 878 + }, + { + "epoch": 0.5216617210682493, + "grad_norm": 0.5221998691558838, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.760187029838562, + "num_tokens": 421797703.0, + "step": 879 + }, + { + "epoch": 0.5222551928783383, + "grad_norm": 0.5367651581764221, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7433136105537415, + "num_tokens": 422303013.0, + "step": 880 + }, + { + "epoch": 0.5228486646884273, + "grad_norm": 0.5653073787689209, + "learning_rate": 1e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7744554281234741, + "num_tokens": 422760022.0, + "step": 881 + }, + { + "epoch": 0.5234421364985163, + "grad_norm": 0.5395925641059875, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7555522918701172, + "num_tokens": 423239591.0, + "step": 882 + }, + { + "epoch": 0.5240356083086053, + "grad_norm": 0.5362303853034973, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 
0.762161910533905, + "num_tokens": 423734658.0, + "step": 883 + }, + { + "epoch": 0.5246290801186944, + "grad_norm": 0.5259239077568054, + "learning_rate": 1e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.763763427734375, + "num_tokens": 424218158.0, + "step": 884 + }, + { + "epoch": 0.5252225519287834, + "grad_norm": 0.5325724482536316, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7506427764892578, + "num_tokens": 424688309.0, + "step": 885 + }, + { + "epoch": 0.5258160237388724, + "grad_norm": 0.5374060273170471, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7636137008666992, + "num_tokens": 425180413.0, + "step": 886 + }, + { + "epoch": 0.5264094955489614, + "grad_norm": 0.589131236076355, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7630670666694641, + "num_tokens": 425632244.0, + "step": 887 + }, + { + "epoch": 0.5270029673590505, + "grad_norm": 0.5335395932197571, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7595585584640503, + "num_tokens": 426125274.0, + "step": 888 + }, + { + "epoch": 0.5275964391691395, + "grad_norm": 0.5293880105018616, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7445484399795532, + "num_tokens": 426611025.0, + "step": 889 + }, + { + "epoch": 0.5281899109792285, + "grad_norm": 0.5675626397132874, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7473444938659668, + "num_tokens": 427046685.0, + "step": 890 + }, + { + "epoch": 0.5287833827893175, + "grad_norm": 0.5344393849372864, + "learning_rate": 1e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7626219391822815, + "num_tokens": 427524172.0, + "step": 891 + }, + { + "epoch": 0.5293768545994065, + "grad_norm": 0.569698691368103, + "learning_rate": 1e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7569761276245117, + "num_tokens": 427969309.0, + "step": 892 + }, + { + "epoch": 0.5299703264094956, + "grad_norm": 0.6000901460647583, + 
"learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7463290691375732, + "num_tokens": 428422752.0, + "step": 893 + }, + { + "epoch": 0.5305637982195845, + "grad_norm": 0.5477313995361328, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7493192553520203, + "num_tokens": 428948191.0, + "step": 894 + }, + { + "epoch": 0.5311572700296736, + "grad_norm": 0.5179693102836609, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7607508897781372, + "num_tokens": 429479554.0, + "step": 895 + }, + { + "epoch": 0.5317507418397626, + "grad_norm": 0.561523973941803, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7594807744026184, + "num_tokens": 429938917.0, + "step": 896 + }, + { + "epoch": 0.5323442136498516, + "grad_norm": 0.5591444373130798, + "learning_rate": 1e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7669091820716858, + "num_tokens": 430418678.0, + "step": 897 + }, + { + "epoch": 0.5329376854599407, + "grad_norm": 0.5391210913658142, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7592368721961975, + "num_tokens": 430953166.0, + "step": 898 + }, + { + "epoch": 0.5335311572700296, + "grad_norm": 0.5578949451446533, + "learning_rate": 1e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7471494078636169, + "num_tokens": 431412519.0, + "step": 899 + }, + { + "epoch": 0.5341246290801187, + "grad_norm": 0.5341348052024841, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7726964950561523, + "num_tokens": 431922536.0, + "step": 900 + }, + { + "epoch": 0.5347181008902078, + "grad_norm": 0.5267230272293091, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7499598860740662, + "num_tokens": 432428242.0, + "step": 901 + }, + { + "epoch": 0.5353115727002967, + "grad_norm": 0.5328270792961121, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.749785840511322, + "num_tokens": 432932503.0, + "step": 902 + }, + { 
+ "epoch": 0.5359050445103858, + "grad_norm": 0.5560482144355774, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7478468418121338, + "num_tokens": 433410023.0, + "step": 903 + }, + { + "epoch": 0.5364985163204747, + "grad_norm": 0.5422305464744568, + "learning_rate": 1e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7718049883842468, + "num_tokens": 433887085.0, + "step": 904 + }, + { + "epoch": 0.5370919881305638, + "grad_norm": 0.5533691644668579, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7618618011474609, + "num_tokens": 434371653.0, + "step": 905 + }, + { + "epoch": 0.5376854599406529, + "grad_norm": 0.5407783389091492, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7521530389785767, + "num_tokens": 434849630.0, + "step": 906 + }, + { + "epoch": 0.5382789317507418, + "grad_norm": 0.5564159750938416, + "learning_rate": 1e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7563549280166626, + "num_tokens": 435310652.0, + "step": 907 + }, + { + "epoch": 0.5388724035608309, + "grad_norm": 0.5803408026695251, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7458709478378296, + "num_tokens": 435773126.0, + "step": 908 + }, + { + "epoch": 0.5394658753709198, + "grad_norm": 0.6030212640762329, + "learning_rate": 1e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7574867606163025, + "num_tokens": 436215436.0, + "step": 909 + }, + { + "epoch": 0.5400593471810089, + "grad_norm": 0.5764194130897522, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7613422274589539, + "num_tokens": 436710137.0, + "step": 910 + }, + { + "epoch": 0.540652818991098, + "grad_norm": 0.5418172478675842, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7639732956886292, + "num_tokens": 437209594.0, + "step": 911 + }, + { + "epoch": 0.5412462908011869, + "grad_norm": 0.5901914834976196, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 
0.7555909752845764, + "num_tokens": 437671277.0, + "step": 912 + }, + { + "epoch": 0.541839762611276, + "grad_norm": 0.538288950920105, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7448257207870483, + "num_tokens": 438166541.0, + "step": 913 + }, + { + "epoch": 0.542433234421365, + "grad_norm": 0.5381378531455994, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7474410533905029, + "num_tokens": 438681232.0, + "step": 914 + }, + { + "epoch": 0.543026706231454, + "grad_norm": 0.5334933996200562, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7606932520866394, + "num_tokens": 439211225.0, + "step": 915 + }, + { + "epoch": 0.5436201780415431, + "grad_norm": 0.5451444983482361, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7397217750549316, + "num_tokens": 439728256.0, + "step": 916 + }, + { + "epoch": 0.544213649851632, + "grad_norm": 0.6136766076087952, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7545346617698669, + "num_tokens": 440130877.0, + "step": 917 + }, + { + "epoch": 0.5448071216617211, + "grad_norm": 0.5731860399246216, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7595998048782349, + "num_tokens": 440571625.0, + "step": 918 + }, + { + "epoch": 0.5454005934718101, + "grad_norm": 0.5429186224937439, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7418899536132812, + "num_tokens": 441068747.0, + "step": 919 + }, + { + "epoch": 0.5459940652818991, + "grad_norm": 0.5421537756919861, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7527735829353333, + "num_tokens": 441535614.0, + "step": 920 + }, + { + "epoch": 0.5465875370919882, + "grad_norm": 0.6150184273719788, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7683119177818298, + "num_tokens": 442032926.0, + "step": 921 + }, + { + "epoch": 0.5471810089020771, + "grad_norm": 0.5991718173027039, + 
"learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7404080033302307, + "num_tokens": 442492714.0, + "step": 922 + }, + { + "epoch": 0.5477744807121662, + "grad_norm": 0.5722286701202393, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7576669454574585, + "num_tokens": 442929593.0, + "step": 923 + }, + { + "epoch": 0.5483679525222552, + "grad_norm": 0.5953434705734253, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.740953803062439, + "num_tokens": 443427695.0, + "step": 924 + }, + { + "epoch": 0.5489614243323442, + "grad_norm": 0.5854835510253906, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7534998655319214, + "num_tokens": 443881636.0, + "step": 925 + }, + { + "epoch": 0.5495548961424332, + "grad_norm": 0.5792399644851685, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7419452667236328, + "num_tokens": 444310040.0, + "step": 926 + }, + { + "epoch": 0.5501483679525223, + "grad_norm": 0.6112930178642273, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7415668368339539, + "num_tokens": 444721564.0, + "step": 927 + }, + { + "epoch": 0.5507418397626113, + "grad_norm": 0.5720191597938538, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7508874535560608, + "num_tokens": 445166134.0, + "step": 928 + }, + { + "epoch": 0.5513353115727003, + "grad_norm": 0.5459293127059937, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7561644315719604, + "num_tokens": 445630644.0, + "step": 929 + }, + { + "epoch": 0.5519287833827893, + "grad_norm": 0.563673198223114, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7444182634353638, + "num_tokens": 446099578.0, + "step": 930 + }, + { + "epoch": 0.5525222551928783, + "grad_norm": 0.5524821877479553, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7567359209060669, + "num_tokens": 446559885.0, + "step": 931 + }, + { + 
"epoch": 0.5531157270029674, + "grad_norm": 0.5572383999824524, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7649646997451782, + "num_tokens": 447060758.0, + "step": 932 + }, + { + "epoch": 0.5537091988130564, + "grad_norm": 0.5345688462257385, + "learning_rate": 1e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7551721930503845, + "num_tokens": 447590683.0, + "step": 933 + }, + { + "epoch": 0.5543026706231454, + "grad_norm": 0.5444023609161377, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7503347992897034, + "num_tokens": 448078077.0, + "step": 934 + }, + { + "epoch": 0.5548961424332344, + "grad_norm": 0.5506716966629028, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7549377679824829, + "num_tokens": 448565868.0, + "step": 935 + }, + { + "epoch": 0.5554896142433234, + "grad_norm": 0.5771530866622925, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7409282922744751, + "num_tokens": 449037397.0, + "step": 936 + }, + { + "epoch": 0.5560830860534125, + "grad_norm": 0.526904284954071, + "learning_rate": 1e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7611523270606995, + "num_tokens": 449552365.0, + "step": 937 + }, + { + "epoch": 0.5566765578635015, + "grad_norm": 0.5411155223846436, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7547004222869873, + "num_tokens": 450054294.0, + "step": 938 + }, + { + "epoch": 0.5572700296735905, + "grad_norm": 0.5286498665809631, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7647802829742432, + "num_tokens": 450548374.0, + "step": 939 + }, + { + "epoch": 0.5578635014836796, + "grad_norm": 0.5474315285682678, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.742336630821228, + "num_tokens": 451054019.0, + "step": 940 + }, + { + "epoch": 0.5584569732937685, + "grad_norm": 0.5505348443984985, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 
0.756218671798706, + "num_tokens": 451533734.0, + "step": 941 + }, + { + "epoch": 0.5590504451038576, + "grad_norm": 0.5375372171401978, + "learning_rate": 1e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7643747925758362, + "num_tokens": 451994411.0, + "step": 942 + }, + { + "epoch": 0.5596439169139465, + "grad_norm": 0.5575270056724548, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7572804689407349, + "num_tokens": 452486770.0, + "step": 943 + }, + { + "epoch": 0.5602373887240356, + "grad_norm": 0.5607778429985046, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7450734376907349, + "num_tokens": 452907476.0, + "step": 944 + }, + { + "epoch": 0.5608308605341247, + "grad_norm": 0.5675557851791382, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7549402713775635, + "num_tokens": 453334208.0, + "step": 945 + }, + { + "epoch": 0.5614243323442136, + "grad_norm": 0.5730637311935425, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.754497230052948, + "num_tokens": 453796276.0, + "step": 946 + }, + { + "epoch": 0.5620178041543027, + "grad_norm": 0.5664350986480713, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7510865926742554, + "num_tokens": 454237686.0, + "step": 947 + }, + { + "epoch": 0.5626112759643916, + "grad_norm": 0.5559481382369995, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7616692185401917, + "num_tokens": 454729612.0, + "step": 948 + }, + { + "epoch": 0.5632047477744807, + "grad_norm": 0.5361873507499695, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7613271474838257, + "num_tokens": 455225527.0, + "step": 949 + }, + { + "epoch": 0.5637982195845698, + "grad_norm": 0.5507592558860779, + "learning_rate": 1e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7672473788261414, + "num_tokens": 455685772.0, + "step": 950 + }, + { + "epoch": 0.5643916913946587, + "grad_norm": 0.5342733263969421, 
+ "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7601253390312195, + "num_tokens": 456181392.0, + "step": 951 + }, + { + "epoch": 0.5649851632047478, + "grad_norm": 0.5342447757720947, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7677117586135864, + "num_tokens": 456673054.0, + "step": 952 + }, + { + "epoch": 0.5655786350148369, + "grad_norm": 0.5195751786231995, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7590353488922119, + "num_tokens": 457159222.0, + "step": 953 + }, + { + "epoch": 0.5661721068249258, + "grad_norm": 0.5323086977005005, + "learning_rate": 1e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7498425245285034, + "num_tokens": 457625184.0, + "step": 954 + }, + { + "epoch": 0.5667655786350149, + "grad_norm": 0.55951327085495, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7440413236618042, + "num_tokens": 458094129.0, + "step": 955 + }, + { + "epoch": 0.5673590504451038, + "grad_norm": 0.5532185435295105, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7440811395645142, + "num_tokens": 458555788.0, + "step": 956 + }, + { + "epoch": 0.5679525222551929, + "grad_norm": 0.532283365726471, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7580296993255615, + "num_tokens": 459015420.0, + "step": 957 + }, + { + "epoch": 0.568545994065282, + "grad_norm": 0.5730270743370056, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7396968603134155, + "num_tokens": 459478964.0, + "step": 958 + }, + { + "epoch": 0.5691394658753709, + "grad_norm": 0.5817393660545349, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.758807361125946, + "num_tokens": 459899391.0, + "step": 959 + }, + { + "epoch": 0.56973293768546, + "grad_norm": 0.560522735118866, + "learning_rate": 1e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7619248628616333, + "num_tokens": 460325346.0, + "step": 960 + }, + { + 
"epoch": 0.5703264094955489, + "grad_norm": 0.5347142815589905, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7504902482032776, + "num_tokens": 460804567.0, + "step": 961 + }, + { + "epoch": 0.570919881305638, + "grad_norm": 0.6067764163017273, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7479256391525269, + "num_tokens": 461253427.0, + "step": 962 + }, + { + "epoch": 0.571513353115727, + "grad_norm": 0.5509281754493713, + "learning_rate": 1e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7578601241111755, + "num_tokens": 461731231.0, + "step": 963 + }, + { + "epoch": 0.572106824925816, + "grad_norm": 0.5533242225646973, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.758337676525116, + "num_tokens": 462197318.0, + "step": 964 + }, + { + "epoch": 0.5727002967359051, + "grad_norm": 0.558750569820404, + "learning_rate": 1e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7620296478271484, + "num_tokens": 462668214.0, + "step": 965 + }, + { + "epoch": 0.5732937685459941, + "grad_norm": 0.5766183137893677, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7430593967437744, + "num_tokens": 463128013.0, + "step": 966 + }, + { + "epoch": 0.5738872403560831, + "grad_norm": 0.5658730268478394, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7503376603126526, + "num_tokens": 463581910.0, + "step": 967 + }, + { + "epoch": 0.5744807121661721, + "grad_norm": 0.5604237914085388, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7656127214431763, + "num_tokens": 464081587.0, + "step": 968 + }, + { + "epoch": 0.5750741839762611, + "grad_norm": 0.5502833724021912, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.758025050163269, + "num_tokens": 464553805.0, + "step": 969 + }, + { + "epoch": 0.5756676557863502, + "grad_norm": 0.5127473473548889, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 
0.7665948867797852, + "num_tokens": 465071839.0, + "step": 970 + }, + { + "epoch": 0.5762611275964392, + "grad_norm": 0.5555580258369446, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7750061750411987, + "num_tokens": 465557872.0, + "step": 971 + }, + { + "epoch": 0.5768545994065282, + "grad_norm": 0.5432817935943604, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7442179918289185, + "num_tokens": 466079644.0, + "step": 972 + }, + { + "epoch": 0.5774480712166172, + "grad_norm": 0.5849155187606812, + "learning_rate": 1e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7512410879135132, + "num_tokens": 466527629.0, + "step": 973 + }, + { + "epoch": 0.5780415430267062, + "grad_norm": 0.5727483034133911, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7563936710357666, + "num_tokens": 467005083.0, + "step": 974 + }, + { + "epoch": 0.5786350148367952, + "grad_norm": 0.5369318723678589, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7354838252067566, + "num_tokens": 467490967.0, + "step": 975 + }, + { + "epoch": 0.5792284866468843, + "grad_norm": 0.5432652831077576, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7651486396789551, + "num_tokens": 467954357.0, + "step": 976 + }, + { + "epoch": 0.5798219584569733, + "grad_norm": 0.6165082454681396, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7436692714691162, + "num_tokens": 468391806.0, + "step": 977 + }, + { + "epoch": 0.5804154302670623, + "grad_norm": 0.5617247223854065, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7637485265731812, + "num_tokens": 468832616.0, + "step": 978 + }, + { + "epoch": 0.5810089020771514, + "grad_norm": 0.5392219424247742, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7637937068939209, + "num_tokens": 469334877.0, + "step": 979 + }, + { + "epoch": 0.5816023738872403, + "grad_norm": 0.5823574066162109, + 
"learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7483365535736084, + "num_tokens": 469790136.0, + "step": 980 + }, + { + "epoch": 0.5821958456973294, + "grad_norm": 0.5558057427406311, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7634662985801697, + "num_tokens": 470261487.0, + "step": 981 + }, + { + "epoch": 0.5827893175074184, + "grad_norm": 0.5662059187889099, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7614514827728271, + "num_tokens": 470702922.0, + "step": 982 + }, + { + "epoch": 0.5833827893175074, + "grad_norm": 0.5658795237541199, + "learning_rate": 1e-06, + "loss": 0.7944, + "mean_token_accuracy": 0.7533152103424072, + "num_tokens": 471200776.0, + "step": 983 + }, + { + "epoch": 0.5839762611275965, + "grad_norm": 0.5659677982330322, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7631294131278992, + "num_tokens": 471670412.0, + "step": 984 + }, + { + "epoch": 0.5845697329376854, + "grad_norm": 0.5541038513183594, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7532142400741577, + "num_tokens": 472158499.0, + "step": 985 + }, + { + "epoch": 0.5851632047477745, + "grad_norm": 0.5584744811058044, + "learning_rate": 1e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7539833784103394, + "num_tokens": 472630074.0, + "step": 986 + }, + { + "epoch": 0.5857566765578635, + "grad_norm": 0.5780338048934937, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7509476542472839, + "num_tokens": 473131057.0, + "step": 987 + }, + { + "epoch": 0.5863501483679525, + "grad_norm": 0.5528549551963806, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7547082901000977, + "num_tokens": 473628198.0, + "step": 988 + }, + { + "epoch": 0.5869436201780416, + "grad_norm": 0.5531694889068604, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7614384889602661, + "num_tokens": 474120153.0, + "step": 989 + }, + 
{ + "epoch": 0.5875370919881305, + "grad_norm": 0.566464364528656, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7614994645118713, + "num_tokens": 474577515.0, + "step": 990 + }, + { + "epoch": 0.5881305637982196, + "grad_norm": 0.5431317090988159, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7519981861114502, + "num_tokens": 475072613.0, + "step": 991 + }, + { + "epoch": 0.5887240356083087, + "grad_norm": 0.5569620728492737, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7449226379394531, + "num_tokens": 475563385.0, + "step": 992 + }, + { + "epoch": 0.5893175074183976, + "grad_norm": 0.5334853529930115, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7545514702796936, + "num_tokens": 476070502.0, + "step": 993 + }, + { + "epoch": 0.5899109792284867, + "grad_norm": 0.5719658732414246, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.743228554725647, + "num_tokens": 476558132.0, + "step": 994 + }, + { + "epoch": 0.5905044510385756, + "grad_norm": 0.5391477346420288, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7635074257850647, + "num_tokens": 477044235.0, + "step": 995 + }, + { + "epoch": 0.5910979228486647, + "grad_norm": 0.5644832849502563, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7468091249465942, + "num_tokens": 477491294.0, + "step": 996 + }, + { + "epoch": 0.5916913946587538, + "grad_norm": 0.5549913644790649, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.757700502872467, + "num_tokens": 477963426.0, + "step": 997 + }, + { + "epoch": 0.5922848664688427, + "grad_norm": 0.5376408696174622, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7443233728408813, + "num_tokens": 478469709.0, + "step": 998 + }, + { + "epoch": 0.5928783382789318, + "grad_norm": 0.5414680242538452, + "learning_rate": 1e-06, + "loss": 0.8139, + "mean_token_accuracy": 
0.748085618019104, + "num_tokens": 478969447.0, + "step": 999 + }, + { + "epoch": 0.5934718100890207, + "grad_norm": 0.5649310946464539, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7409870624542236, + "num_tokens": 479412958.0, + "step": 1000 + }, + { + "epoch": 0.5940652818991098, + "grad_norm": 0.5744842886924744, + "learning_rate": 1e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.7519419193267822, + "num_tokens": 479894177.0, + "step": 1001 + }, + { + "epoch": 0.5946587537091989, + "grad_norm": 0.564259946346283, + "learning_rate": 1e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7613949775695801, + "num_tokens": 480372543.0, + "step": 1002 + }, + { + "epoch": 0.5952522255192878, + "grad_norm": 0.5785170197486877, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7469285726547241, + "num_tokens": 480824257.0, + "step": 1003 + }, + { + "epoch": 0.5958456973293769, + "grad_norm": 0.5573448538780212, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7493431568145752, + "num_tokens": 481292821.0, + "step": 1004 + }, + { + "epoch": 0.5964391691394659, + "grad_norm": 0.5633206963539124, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7552364468574524, + "num_tokens": 481780565.0, + "step": 1005 + }, + { + "epoch": 0.5970326409495549, + "grad_norm": 0.548689067363739, + "learning_rate": 1e-06, + "loss": 0.791, + "mean_token_accuracy": 0.755241870880127, + "num_tokens": 482238522.0, + "step": 1006 + }, + { + "epoch": 0.597626112759644, + "grad_norm": 0.5571069121360779, + "learning_rate": 1e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7664744853973389, + "num_tokens": 482719573.0, + "step": 1007 + }, + { + "epoch": 0.5982195845697329, + "grad_norm": 0.5826785564422607, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7608780860900879, + "num_tokens": 483175039.0, + "step": 1008 + }, + { + "epoch": 0.598813056379822, + "grad_norm": 
0.5566063523292542, + "learning_rate": 1e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7671729922294617, + "num_tokens": 483623245.0, + "step": 1009 + }, + { + "epoch": 0.599406528189911, + "grad_norm": 0.5507813692092896, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7554944753646851, + "num_tokens": 484105133.0, + "step": 1010 + }, + { + "epoch": 0.6, + "grad_norm": 0.5731980204582214, + "learning_rate": 1e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7510207891464233, + "num_tokens": 484552405.0, + "step": 1011 + }, + { + "epoch": 0.600593471810089, + "grad_norm": 0.5488348603248596, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7635908722877502, + "num_tokens": 485044483.0, + "step": 1012 + }, + { + "epoch": 0.601186943620178, + "grad_norm": 0.5317894220352173, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7685600519180298, + "num_tokens": 485549577.0, + "step": 1013 + }, + { + "epoch": 0.6017804154302671, + "grad_norm": 0.5339688658714294, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7548835277557373, + "num_tokens": 486047834.0, + "step": 1014 + }, + { + "epoch": 0.6023738872403561, + "grad_norm": 0.5702508091926575, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7630927562713623, + "num_tokens": 486484042.0, + "step": 1015 + }, + { + "epoch": 0.6029673590504451, + "grad_norm": 0.564480721950531, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7505805492401123, + "num_tokens": 486974739.0, + "step": 1016 + }, + { + "epoch": 0.6035608308605341, + "grad_norm": 0.5614084601402283, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7496406435966492, + "num_tokens": 487462812.0, + "step": 1017 + }, + { + "epoch": 0.6041543026706232, + "grad_norm": 0.54329514503479, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7683806419372559, + "num_tokens": 487931887.0, + "step": 
1018 + }, + { + "epoch": 0.6047477744807122, + "grad_norm": 0.5625824332237244, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7614861726760864, + "num_tokens": 488418341.0, + "step": 1019 + }, + { + "epoch": 0.6053412462908012, + "grad_norm": 0.535855770111084, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7754782438278198, + "num_tokens": 488895487.0, + "step": 1020 + }, + { + "epoch": 0.6059347181008902, + "grad_norm": 0.5195727348327637, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7658283114433289, + "num_tokens": 489395022.0, + "step": 1021 + }, + { + "epoch": 0.6065281899109792, + "grad_norm": 0.5633530616760254, + "learning_rate": 1e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7483521699905396, + "num_tokens": 489858299.0, + "step": 1022 + }, + { + "epoch": 0.6071216617210683, + "grad_norm": 0.5471150875091553, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7521762251853943, + "num_tokens": 490336851.0, + "step": 1023 + }, + { + "epoch": 0.6077151335311572, + "grad_norm": 0.5559237599372864, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7297986745834351, + "num_tokens": 490803089.0, + "step": 1024 + }, + { + "epoch": 0.6083086053412463, + "grad_norm": 0.542669415473938, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7484477758407593, + "num_tokens": 491284467.0, + "step": 1025 + }, + { + "epoch": 0.6089020771513353, + "grad_norm": 0.5496013164520264, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7654154300689697, + "num_tokens": 491773106.0, + "step": 1026 + }, + { + "epoch": 0.6094955489614243, + "grad_norm": 0.5671445727348328, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7638640403747559, + "num_tokens": 492264495.0, + "step": 1027 + }, + { + "epoch": 0.6100890207715134, + "grad_norm": 0.5497421026229858, + "learning_rate": 1e-06, + "loss": 0.8488, + 
"mean_token_accuracy": 0.7389217615127563, + "num_tokens": 492755207.0, + "step": 1028 + }, + { + "epoch": 0.6106824925816023, + "grad_norm": 0.5263799428939819, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7634984254837036, + "num_tokens": 493259274.0, + "step": 1029 + }, + { + "epoch": 0.6112759643916914, + "grad_norm": 0.5804959535598755, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.753119707107544, + "num_tokens": 493717824.0, + "step": 1030 + }, + { + "epoch": 0.6118694362017805, + "grad_norm": 0.5304261445999146, + "learning_rate": 1e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7699906826019287, + "num_tokens": 494223232.0, + "step": 1031 + }, + { + "epoch": 0.6124629080118694, + "grad_norm": 0.5291500687599182, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7581942677497864, + "num_tokens": 494729298.0, + "step": 1032 + }, + { + "epoch": 0.6130563798219585, + "grad_norm": 0.5596256852149963, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.756949782371521, + "num_tokens": 495207194.0, + "step": 1033 + }, + { + "epoch": 0.6136498516320474, + "grad_norm": 0.5261120200157166, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.765047550201416, + "num_tokens": 495685739.0, + "step": 1034 + }, + { + "epoch": 0.6142433234421365, + "grad_norm": 0.5887712240219116, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7471001744270325, + "num_tokens": 496119463.0, + "step": 1035 + }, + { + "epoch": 0.6148367952522256, + "grad_norm": 0.5414243340492249, + "learning_rate": 1e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7587094306945801, + "num_tokens": 496643166.0, + "step": 1036 + }, + { + "epoch": 0.6154302670623145, + "grad_norm": 0.5685902237892151, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7414819598197937, + "num_tokens": 497083915.0, + "step": 1037 + }, + { + "epoch": 0.6160237388724036, + 
"grad_norm": 0.5439549088478088, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7599575519561768, + "num_tokens": 497534631.0, + "step": 1038 + }, + { + "epoch": 0.6166172106824925, + "grad_norm": 0.5472052693367004, + "learning_rate": 1e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7485750913619995, + "num_tokens": 498010115.0, + "step": 1039 + }, + { + "epoch": 0.6172106824925816, + "grad_norm": 0.555454671382904, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7551462650299072, + "num_tokens": 498474418.0, + "step": 1040 + }, + { + "epoch": 0.6178041543026707, + "grad_norm": 0.518955647945404, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7486373782157898, + "num_tokens": 498956080.0, + "step": 1041 + }, + { + "epoch": 0.6183976261127596, + "grad_norm": 0.5691362023353577, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7554879188537598, + "num_tokens": 499403000.0, + "step": 1042 + }, + { + "epoch": 0.6189910979228487, + "grad_norm": 0.5290747880935669, + "learning_rate": 1e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7581131458282471, + "num_tokens": 499912230.0, + "step": 1043 + }, + { + "epoch": 0.6195845697329377, + "grad_norm": 0.5467540621757507, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7636780738830566, + "num_tokens": 500367884.0, + "step": 1044 + }, + { + "epoch": 0.6201780415430267, + "grad_norm": 0.5427218675613403, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7562191486358643, + "num_tokens": 500845697.0, + "step": 1045 + }, + { + "epoch": 0.6207715133531158, + "grad_norm": 0.5551059246063232, + "learning_rate": 1e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7534222602844238, + "num_tokens": 501312194.0, + "step": 1046 + }, + { + "epoch": 0.6213649851632047, + "grad_norm": 0.5550471544265747, + "learning_rate": 1e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7665653228759766, + 
"num_tokens": 501777363.0, + "step": 1047 + }, + { + "epoch": 0.6219584569732938, + "grad_norm": 0.5270006060600281, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7563890814781189, + "num_tokens": 502305333.0, + "step": 1048 + }, + { + "epoch": 0.6225519287833828, + "grad_norm": 0.5357360243797302, + "learning_rate": 1e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7564533948898315, + "num_tokens": 502758535.0, + "step": 1049 + }, + { + "epoch": 0.6231454005934718, + "grad_norm": 0.5357418060302734, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7649731636047363, + "num_tokens": 503275974.0, + "step": 1050 + }, + { + "epoch": 0.6237388724035609, + "grad_norm": 0.5427713990211487, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7632479071617126, + "num_tokens": 503763379.0, + "step": 1051 + }, + { + "epoch": 0.6243323442136498, + "grad_norm": 0.5631070137023926, + "learning_rate": 1e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7517096400260925, + "num_tokens": 504178731.0, + "step": 1052 + }, + { + "epoch": 0.6249258160237389, + "grad_norm": 0.5430611968040466, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7504688501358032, + "num_tokens": 504673035.0, + "step": 1053 + }, + { + "epoch": 0.6255192878338279, + "grad_norm": 0.5442337989807129, + "learning_rate": 1e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7581026554107666, + "num_tokens": 505151005.0, + "step": 1054 + }, + { + "epoch": 0.6261127596439169, + "grad_norm": 0.5515488386154175, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7537150382995605, + "num_tokens": 505630540.0, + "step": 1055 + }, + { + "epoch": 0.626706231454006, + "grad_norm": 0.5460256934165955, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7553758025169373, + "num_tokens": 506111754.0, + "step": 1056 + }, + { + "epoch": 0.627299703264095, + "grad_norm": 0.5591485500335693, + 
"learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.74336838722229, + "num_tokens": 506632984.0, + "step": 1057 + }, + { + "epoch": 0.627893175074184, + "grad_norm": 0.5287423729896545, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7472642064094543, + "num_tokens": 507142986.0, + "step": 1058 + }, + { + "epoch": 0.628486646884273, + "grad_norm": 0.5351954698562622, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7386619448661804, + "num_tokens": 507632384.0, + "step": 1059 + }, + { + "epoch": 0.629080118694362, + "grad_norm": 0.5276875495910645, + "learning_rate": 1e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7739663124084473, + "num_tokens": 508134717.0, + "step": 1060 + }, + { + "epoch": 0.629673590504451, + "grad_norm": 0.5482342839241028, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7545934915542603, + "num_tokens": 508577258.0, + "step": 1061 + }, + { + "epoch": 0.6302670623145401, + "grad_norm": 0.5456708669662476, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7610800266265869, + "num_tokens": 509052727.0, + "step": 1062 + }, + { + "epoch": 0.6308605341246291, + "grad_norm": 0.5167871713638306, + "learning_rate": 1e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.775797963142395, + "num_tokens": 509575817.0, + "step": 1063 + }, + { + "epoch": 0.6314540059347181, + "grad_norm": 0.5322780609130859, + "learning_rate": 1e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7655402421951294, + "num_tokens": 510052455.0, + "step": 1064 + }, + { + "epoch": 0.6320474777448071, + "grad_norm": 0.5189453363418579, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7583996653556824, + "num_tokens": 510544599.0, + "step": 1065 + }, + { + "epoch": 0.6326409495548961, + "grad_norm": 0.5592953562736511, + "learning_rate": 1e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7554374933242798, + "num_tokens": 511008433.0, + "step": 1066 + }, + 
{ + "epoch": 0.6332344213649852, + "grad_norm": 0.5595820546150208, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7646671533584595, + "num_tokens": 511476297.0, + "step": 1067 + }, + { + "epoch": 0.6338278931750742, + "grad_norm": 0.5476646423339844, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7532398700714111, + "num_tokens": 511959951.0, + "step": 1068 + }, + { + "epoch": 0.6344213649851632, + "grad_norm": 0.5645373463630676, + "learning_rate": 1e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.752507209777832, + "num_tokens": 512408401.0, + "step": 1069 + }, + { + "epoch": 0.6350148367952523, + "grad_norm": 0.5418246984481812, + "learning_rate": 1e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7607768774032593, + "num_tokens": 512903576.0, + "step": 1070 + }, + { + "epoch": 0.6356083086053412, + "grad_norm": 0.5518930554389954, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7599940299987793, + "num_tokens": 513376979.0, + "step": 1071 + }, + { + "epoch": 0.6362017804154303, + "grad_norm": 0.5945699214935303, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7614403963088989, + "num_tokens": 513804420.0, + "step": 1072 + }, + { + "epoch": 0.6367952522255192, + "grad_norm": 0.5741817951202393, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7661413550376892, + "num_tokens": 514282195.0, + "step": 1073 + }, + { + "epoch": 0.6373887240356083, + "grad_norm": 0.5536941289901733, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7467665076255798, + "num_tokens": 514744839.0, + "step": 1074 + }, + { + "epoch": 0.6379821958456974, + "grad_norm": 0.5474342107772827, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7375104427337646, + "num_tokens": 515227088.0, + "step": 1075 + }, + { + "epoch": 0.6385756676557863, + "grad_norm": 0.5827683806419373, + "learning_rate": 1e-06, + "loss": 0.746, + 
"mean_token_accuracy": 0.7651134729385376, + "num_tokens": 515665921.0, + "step": 1076 + }, + { + "epoch": 0.6391691394658754, + "grad_norm": 0.6013103723526001, + "learning_rate": 1e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.7571697235107422, + "num_tokens": 516142133.0, + "step": 1077 + }, + { + "epoch": 0.6397626112759643, + "grad_norm": 0.5684564113616943, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7544491291046143, + "num_tokens": 516593337.0, + "step": 1078 + }, + { + "epoch": 0.6403560830860534, + "grad_norm": 0.5323435068130493, + "learning_rate": 1e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7524879574775696, + "num_tokens": 517072324.0, + "step": 1079 + }, + { + "epoch": 0.6409495548961425, + "grad_norm": 0.5584986805915833, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7589091062545776, + "num_tokens": 517550476.0, + "step": 1080 + }, + { + "epoch": 0.6415430267062314, + "grad_norm": 0.5475960969924927, + "learning_rate": 1e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7616787552833557, + "num_tokens": 518051856.0, + "step": 1081 + }, + { + "epoch": 0.6421364985163205, + "grad_norm": 0.5365529656410217, + "learning_rate": 1e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7613685131072998, + "num_tokens": 518532688.0, + "step": 1082 + }, + { + "epoch": 0.6427299703264095, + "grad_norm": 0.5086526274681091, + "learning_rate": 1e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7604482173919678, + "num_tokens": 519038573.0, + "step": 1083 + }, + { + "epoch": 0.6433234421364985, + "grad_norm": 0.5420550107955933, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7531721591949463, + "num_tokens": 519538150.0, + "step": 1084 + }, + { + "epoch": 0.6439169139465876, + "grad_norm": 0.5336398482322693, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.761735200881958, + "num_tokens": 520016625.0, + "step": 1085 + }, + { + "epoch": 0.6445103857566765, + 
"grad_norm": 0.5538918375968933, + "learning_rate": 1e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7775194644927979, + "num_tokens": 520478727.0, + "step": 1086 + }, + { + "epoch": 0.6451038575667656, + "grad_norm": 0.5670120716094971, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7445745468139648, + "num_tokens": 520922669.0, + "step": 1087 + }, + { + "epoch": 0.6456973293768546, + "grad_norm": 1.7789123058319092, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7735767364501953, + "num_tokens": 521433786.0, + "step": 1088 + }, + { + "epoch": 0.6462908011869436, + "grad_norm": 0.5883374214172363, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7620746493339539, + "num_tokens": 521905379.0, + "step": 1089 + }, + { + "epoch": 0.6468842729970327, + "grad_norm": 0.5863202214241028, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7411893606185913, + "num_tokens": 522384815.0, + "step": 1090 + }, + { + "epoch": 0.6474777448071216, + "grad_norm": 0.5510027408599854, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7707654237747192, + "num_tokens": 522861521.0, + "step": 1091 + }, + { + "epoch": 0.6480712166172107, + "grad_norm": 0.5316334366798401, + "learning_rate": 1e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.759680986404419, + "num_tokens": 523349616.0, + "step": 1092 + }, + { + "epoch": 0.6486646884272997, + "grad_norm": 0.5603190660476685, + "learning_rate": 1e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.7539002895355225, + "num_tokens": 523804062.0, + "step": 1093 + }, + { + "epoch": 0.6492581602373887, + "grad_norm": 0.5415472388267517, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.75185626745224, + "num_tokens": 524242013.0, + "step": 1094 + }, + { + "epoch": 0.6498516320474778, + "grad_norm": 0.5425652861595154, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7732141613960266, + 
"num_tokens": 524729658.0, + "step": 1095 + }, + { + "epoch": 0.6504451038575668, + "grad_norm": 0.5739590525627136, + "learning_rate": 1e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7562305331230164, + "num_tokens": 525201209.0, + "step": 1096 + }, + { + "epoch": 0.6510385756676558, + "grad_norm": 0.5863872170448303, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7598593235015869, + "num_tokens": 525647027.0, + "step": 1097 + }, + { + "epoch": 0.6516320474777448, + "grad_norm": 0.5519832968711853, + "learning_rate": 1e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.7508679628372192, + "num_tokens": 526102586.0, + "step": 1098 + }, + { + "epoch": 0.6522255192878338, + "grad_norm": 0.5253292322158813, + "learning_rate": 1e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7589433193206787, + "num_tokens": 526607959.0, + "step": 1099 + }, + { + "epoch": 0.6528189910979229, + "grad_norm": 0.5930752754211426, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7516932487487793, + "num_tokens": 527054406.0, + "step": 1100 + }, + { + "epoch": 0.6534124629080119, + "grad_norm": 0.5540987253189087, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7708128094673157, + "num_tokens": 527532225.0, + "step": 1101 + }, + { + "epoch": 0.6540059347181009, + "grad_norm": 0.5586493015289307, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7453254461288452, + "num_tokens": 528004891.0, + "step": 1102 + }, + { + "epoch": 0.6545994065281899, + "grad_norm": 0.5489908456802368, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7442160844802856, + "num_tokens": 528473547.0, + "step": 1103 + }, + { + "epoch": 0.6551928783382789, + "grad_norm": 0.5997129678726196, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.749476432800293, + "num_tokens": 528886379.0, + "step": 1104 + }, + { + "epoch": 0.655786350148368, + "grad_norm": 0.5300183892250061, + 
"learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.766672670841217, + "num_tokens": 529335457.0, + "step": 1105 + }, + { + "epoch": 0.656379821958457, + "grad_norm": 0.5463749170303345, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7473387718200684, + "num_tokens": 529806577.0, + "step": 1106 + }, + { + "epoch": 0.656973293768546, + "grad_norm": 0.586467981338501, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7629148364067078, + "num_tokens": 530201846.0, + "step": 1107 + }, + { + "epoch": 0.657566765578635, + "grad_norm": 0.5442957282066345, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7689090967178345, + "num_tokens": 530688185.0, + "step": 1108 + }, + { + "epoch": 0.6581602373887241, + "grad_norm": 0.5789203643798828, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7521878480911255, + "num_tokens": 531127727.0, + "step": 1109 + }, + { + "epoch": 0.658753709198813, + "grad_norm": 0.5400189757347107, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.767385721206665, + "num_tokens": 531609677.0, + "step": 1110 + }, + { + "epoch": 0.6593471810089021, + "grad_norm": 0.533588707447052, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7712597250938416, + "num_tokens": 532097888.0, + "step": 1111 + }, + { + "epoch": 0.6599406528189911, + "grad_norm": 0.5632157325744629, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7497409582138062, + "num_tokens": 532593392.0, + "step": 1112 + }, + { + "epoch": 0.6605341246290801, + "grad_norm": 0.5319656133651733, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7548034191131592, + "num_tokens": 533142950.0, + "step": 1113 + }, + { + "epoch": 0.6611275964391692, + "grad_norm": 0.5839518904685974, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7706241011619568, + "num_tokens": 533614075.0, + "step": 1114 + }, 
+ { + "epoch": 0.6617210682492581, + "grad_norm": 0.5725891590118408, + "learning_rate": 1e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.7541035413742065, + "num_tokens": 534056526.0, + "step": 1115 + }, + { + "epoch": 0.6623145400593472, + "grad_norm": 0.5369434356689453, + "learning_rate": 1e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7511076927185059, + "num_tokens": 534540837.0, + "step": 1116 + }, + { + "epoch": 0.6629080118694362, + "grad_norm": 0.5402957201004028, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7528309226036072, + "num_tokens": 535032639.0, + "step": 1117 + }, + { + "epoch": 0.6635014836795252, + "grad_norm": 0.5313919186592102, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7496205568313599, + "num_tokens": 535518124.0, + "step": 1118 + }, + { + "epoch": 0.6640949554896143, + "grad_norm": 0.541840136051178, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7447898387908936, + "num_tokens": 536067649.0, + "step": 1119 + }, + { + "epoch": 0.6646884272997032, + "grad_norm": 0.5686322450637817, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7423270344734192, + "num_tokens": 536526804.0, + "step": 1120 + }, + { + "epoch": 0.6652818991097923, + "grad_norm": 0.6045002341270447, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.744159460067749, + "num_tokens": 537015214.0, + "step": 1121 + }, + { + "epoch": 0.6658753709198814, + "grad_norm": 0.5724703669548035, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7522209882736206, + "num_tokens": 537497660.0, + "step": 1122 + }, + { + "epoch": 0.6664688427299703, + "grad_norm": 0.5659765601158142, + "learning_rate": 1e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7541646957397461, + "num_tokens": 537983504.0, + "step": 1123 + }, + { + "epoch": 0.6670623145400594, + "grad_norm": 0.5887967944145203, + "learning_rate": 1e-06, + "loss": 0.8143, + 
"mean_token_accuracy": 0.7467391490936279, + "num_tokens": 538477795.0, + "step": 1124 + }, + { + "epoch": 0.6676557863501483, + "grad_norm": 0.5963385105133057, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.756980299949646, + "num_tokens": 538937581.0, + "step": 1125 + }, + { + "epoch": 0.6682492581602374, + "grad_norm": 0.554270327091217, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7564516067504883, + "num_tokens": 539417678.0, + "step": 1126 + }, + { + "epoch": 0.6688427299703265, + "grad_norm": 0.5617198348045349, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7622272968292236, + "num_tokens": 539891213.0, + "step": 1127 + }, + { + "epoch": 0.6694362017804154, + "grad_norm": 0.5432695150375366, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7583418488502502, + "num_tokens": 540419682.0, + "step": 1128 + }, + { + "epoch": 0.6700296735905045, + "grad_norm": 0.5348734855651855, + "learning_rate": 1e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7570290565490723, + "num_tokens": 540926155.0, + "step": 1129 + }, + { + "epoch": 0.6706231454005934, + "grad_norm": 0.5757626295089722, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7722423076629639, + "num_tokens": 541401647.0, + "step": 1130 + }, + { + "epoch": 0.6712166172106825, + "grad_norm": 0.5616686344146729, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7547754645347595, + "num_tokens": 541853319.0, + "step": 1131 + }, + { + "epoch": 0.6718100890207716, + "grad_norm": 0.5819153189659119, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7404221296310425, + "num_tokens": 542320372.0, + "step": 1132 + }, + { + "epoch": 0.6724035608308605, + "grad_norm": 0.6069421172142029, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7423153519630432, + "num_tokens": 542759907.0, + "step": 1133 + }, + { + "epoch": 0.6729970326409496, + 
"grad_norm": 0.5889408588409424, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7710101008415222, + "num_tokens": 543232939.0, + "step": 1134 + }, + { + "epoch": 0.6735905044510386, + "grad_norm": 0.5433868765830994, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7665308117866516, + "num_tokens": 543674912.0, + "step": 1135 + }, + { + "epoch": 0.6741839762611276, + "grad_norm": 0.5243349075317383, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.745868444442749, + "num_tokens": 544169722.0, + "step": 1136 + }, + { + "epoch": 0.6747774480712166, + "grad_norm": 0.5759857296943665, + "learning_rate": 1e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.754747748374939, + "num_tokens": 544641268.0, + "step": 1137 + }, + { + "epoch": 0.6753709198813056, + "grad_norm": 0.5780336856842041, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7586073279380798, + "num_tokens": 545106352.0, + "step": 1138 + }, + { + "epoch": 0.6759643916913947, + "grad_norm": 0.5394971966743469, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7479681968688965, + "num_tokens": 545580352.0, + "step": 1139 + }, + { + "epoch": 0.6765578635014837, + "grad_norm": 0.5566816926002502, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7621606588363647, + "num_tokens": 546069341.0, + "step": 1140 + }, + { + "epoch": 0.6771513353115727, + "grad_norm": 0.6106308698654175, + "learning_rate": 1e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7732540965080261, + "num_tokens": 546521646.0, + "step": 1141 + }, + { + "epoch": 0.6777448071216617, + "grad_norm": 0.5586240291595459, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7599822878837585, + "num_tokens": 547006786.0, + "step": 1142 + }, + { + "epoch": 0.6783382789317507, + "grad_norm": 0.5352938771247864, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7721728086471558, + 
"num_tokens": 547486756.0, + "step": 1143 + }, + { + "epoch": 0.6789317507418398, + "grad_norm": 0.5900688767433167, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7582769393920898, + "num_tokens": 547984998.0, + "step": 1144 + }, + { + "epoch": 0.6795252225519288, + "grad_norm": 0.5706266164779663, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7418028116226196, + "num_tokens": 548425329.0, + "step": 1145 + }, + { + "epoch": 0.6801186943620178, + "grad_norm": 0.5280845165252686, + "learning_rate": 1e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7571172714233398, + "num_tokens": 548893564.0, + "step": 1146 + }, + { + "epoch": 0.6807121661721068, + "grad_norm": 0.5504260659217834, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7411454319953918, + "num_tokens": 549394736.0, + "step": 1147 + }, + { + "epoch": 0.6813056379821959, + "grad_norm": 0.6023343205451965, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.747904896736145, + "num_tokens": 549855949.0, + "step": 1148 + }, + { + "epoch": 0.6818991097922849, + "grad_norm": 0.5331653356552124, + "learning_rate": 1e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7791095972061157, + "num_tokens": 550336353.0, + "step": 1149 + }, + { + "epoch": 0.6824925816023739, + "grad_norm": 0.5707616209983826, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7386894822120667, + "num_tokens": 550790261.0, + "step": 1150 + }, + { + "epoch": 0.6830860534124629, + "grad_norm": 0.5341805815696716, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7531527280807495, + "num_tokens": 551279671.0, + "step": 1151 + }, + { + "epoch": 0.6836795252225519, + "grad_norm": 0.593071460723877, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7389765977859497, + "num_tokens": 551711233.0, + "step": 1152 + }, + { + "epoch": 0.684272997032641, + "grad_norm": 0.5491460561752319, + 
"learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7601798176765442, + "num_tokens": 552192179.0, + "step": 1153 + }, + { + "epoch": 0.68486646884273, + "grad_norm": 0.5801764726638794, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7445427179336548, + "num_tokens": 552643821.0, + "step": 1154 + }, + { + "epoch": 0.685459940652819, + "grad_norm": 0.532228410243988, + "learning_rate": 1e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.76413893699646, + "num_tokens": 553138108.0, + "step": 1155 + }, + { + "epoch": 0.686053412462908, + "grad_norm": 0.5616974830627441, + "learning_rate": 1e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7653659582138062, + "num_tokens": 553553388.0, + "step": 1156 + }, + { + "epoch": 0.686646884272997, + "grad_norm": 0.567577600479126, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7600142955780029, + "num_tokens": 554013928.0, + "step": 1157 + }, + { + "epoch": 0.6872403560830861, + "grad_norm": 0.5653443932533264, + "learning_rate": 1e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7574081420898438, + "num_tokens": 554499309.0, + "step": 1158 + }, + { + "epoch": 0.687833827893175, + "grad_norm": 0.5677942037582397, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7552831172943115, + "num_tokens": 554993043.0, + "step": 1159 + }, + { + "epoch": 0.6884272997032641, + "grad_norm": 0.5874456167221069, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7601772546768188, + "num_tokens": 555442100.0, + "step": 1160 + }, + { + "epoch": 0.6890207715133532, + "grad_norm": 0.573520302772522, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7617177963256836, + "num_tokens": 555910225.0, + "step": 1161 + }, + { + "epoch": 0.6896142433234421, + "grad_norm": 0.5768938064575195, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7520359754562378, + "num_tokens": 556351591.0, + "step": 1162 + }, + 
{ + "epoch": 0.6902077151335312, + "grad_norm": 0.5424017906188965, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7616515159606934, + "num_tokens": 556795110.0, + "step": 1163 + }, + { + "epoch": 0.6908011869436201, + "grad_norm": 0.603423535823822, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7509706020355225, + "num_tokens": 557249180.0, + "step": 1164 + }, + { + "epoch": 0.6913946587537092, + "grad_norm": 0.574256181716919, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7646854519844055, + "num_tokens": 557704412.0, + "step": 1165 + }, + { + "epoch": 0.6919881305637983, + "grad_norm": 0.5578528046607971, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7643206119537354, + "num_tokens": 558150913.0, + "step": 1166 + }, + { + "epoch": 0.6925816023738872, + "grad_norm": 0.5804410576820374, + "learning_rate": 1e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7504011988639832, + "num_tokens": 558618862.0, + "step": 1167 + }, + { + "epoch": 0.6931750741839763, + "grad_norm": 0.5340591669082642, + "learning_rate": 1e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7527908682823181, + "num_tokens": 559116216.0, + "step": 1168 + }, + { + "epoch": 0.6937685459940652, + "grad_norm": 0.5084401965141296, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7688837051391602, + "num_tokens": 559616060.0, + "step": 1169 + }, + { + "epoch": 0.6943620178041543, + "grad_norm": 0.5956255793571472, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7351157665252686, + "num_tokens": 560058888.0, + "step": 1170 + }, + { + "epoch": 0.6949554896142434, + "grad_norm": 0.5624662041664124, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7489094734191895, + "num_tokens": 560550369.0, + "step": 1171 + }, + { + "epoch": 0.6955489614243323, + "grad_norm": 0.5512381196022034, + "learning_rate": 1e-06, + "loss": 0.7667, + 
"mean_token_accuracy": 0.7608116865158081, + "num_tokens": 561036961.0, + "step": 1172 + }, + { + "epoch": 0.6961424332344214, + "grad_norm": 0.5720783472061157, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7463370561599731, + "num_tokens": 561530541.0, + "step": 1173 + }, + { + "epoch": 0.6967359050445104, + "grad_norm": 0.5798247456550598, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7557215690612793, + "num_tokens": 562008056.0, + "step": 1174 + }, + { + "epoch": 0.6973293768545994, + "grad_norm": 0.526452362537384, + "learning_rate": 1e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7674695253372192, + "num_tokens": 562503373.0, + "step": 1175 + }, + { + "epoch": 0.6979228486646885, + "grad_norm": 0.5536531805992126, + "learning_rate": 1e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7663732767105103, + "num_tokens": 562959441.0, + "step": 1176 + }, + { + "epoch": 0.6985163204747774, + "grad_norm": 0.5508091449737549, + "learning_rate": 1e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.766901969909668, + "num_tokens": 563419339.0, + "step": 1177 + }, + { + "epoch": 0.6991097922848665, + "grad_norm": 0.5243277549743652, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7720530033111572, + "num_tokens": 563911908.0, + "step": 1178 + }, + { + "epoch": 0.6997032640949555, + "grad_norm": 0.5457372069358826, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.761226236820221, + "num_tokens": 564396408.0, + "step": 1179 + }, + { + "epoch": 0.7002967359050445, + "grad_norm": 0.5161446928977966, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7653247117996216, + "num_tokens": 564910073.0, + "step": 1180 + }, + { + "epoch": 0.7008902077151336, + "grad_norm": 0.5457298755645752, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7586382031440735, + "num_tokens": 565363310.0, + "step": 1181 + }, + { + "epoch": 0.7014836795252225, + 
"grad_norm": 0.5271750092506409, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7702473998069763, + "num_tokens": 565844310.0, + "step": 1182 + }, + { + "epoch": 0.7020771513353116, + "grad_norm": 0.5296856760978699, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7625954151153564, + "num_tokens": 566339097.0, + "step": 1183 + }, + { + "epoch": 0.7026706231454006, + "grad_norm": 0.536748468875885, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7588057518005371, + "num_tokens": 566873617.0, + "step": 1184 + }, + { + "epoch": 0.7032640949554896, + "grad_norm": 0.5425625443458557, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7415745854377747, + "num_tokens": 567344073.0, + "step": 1185 + }, + { + "epoch": 0.7038575667655786, + "grad_norm": 0.5358926057815552, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7619732618331909, + "num_tokens": 567842234.0, + "step": 1186 + }, + { + "epoch": 0.7044510385756677, + "grad_norm": 0.5478368401527405, + "learning_rate": 1e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7548802495002747, + "num_tokens": 568309514.0, + "step": 1187 + }, + { + "epoch": 0.7050445103857567, + "grad_norm": 0.525886058807373, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7583913803100586, + "num_tokens": 568789524.0, + "step": 1188 + }, + { + "epoch": 0.7056379821958457, + "grad_norm": 0.5556285381317139, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7487273216247559, + "num_tokens": 569264852.0, + "step": 1189 + }, + { + "epoch": 0.7062314540059347, + "grad_norm": 0.5566415190696716, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7339919209480286, + "num_tokens": 569704863.0, + "step": 1190 + }, + { + "epoch": 0.7068249258160237, + "grad_norm": 0.5297325849533081, + "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7590423226356506, + 
"num_tokens": 570194866.0, + "step": 1191 + }, + { + "epoch": 0.7074183976261128, + "grad_norm": 0.5279698371887207, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7624220848083496, + "num_tokens": 570679154.0, + "step": 1192 + }, + { + "epoch": 0.7080118694362018, + "grad_norm": 0.5385937094688416, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7629587650299072, + "num_tokens": 571162450.0, + "step": 1193 + }, + { + "epoch": 0.7086053412462908, + "grad_norm": 0.5447594523429871, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7539434432983398, + "num_tokens": 571656333.0, + "step": 1194 + }, + { + "epoch": 0.7091988130563798, + "grad_norm": 0.5410736799240112, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.746070384979248, + "num_tokens": 572147616.0, + "step": 1195 + }, + { + "epoch": 0.7097922848664688, + "grad_norm": 0.49299493432044983, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7539246678352356, + "num_tokens": 572692498.0, + "step": 1196 + }, + { + "epoch": 0.7103857566765579, + "grad_norm": 0.5480455160140991, + "learning_rate": 1e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7622307538986206, + "num_tokens": 573162905.0, + "step": 1197 + }, + { + "epoch": 0.7109792284866469, + "grad_norm": 0.511727511882782, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7591723203659058, + "num_tokens": 573674341.0, + "step": 1198 + }, + { + "epoch": 0.7115727002967359, + "grad_norm": 0.5453901290893555, + "learning_rate": 1e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7534729838371277, + "num_tokens": 574135222.0, + "step": 1199 + }, + { + "epoch": 0.712166172106825, + "grad_norm": 0.5410047173500061, + "learning_rate": 1e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.757415235042572, + "num_tokens": 574637275.0, + "step": 1200 + }, + { + "epoch": 0.7127596439169139, + "grad_norm": 0.5133479237556458, + 
"learning_rate": 1e-06, + "loss": 0.6766, + "mean_token_accuracy": 0.7841988801956177, + "num_tokens": 575151279.0, + "step": 1201 + }, + { + "epoch": 0.713353115727003, + "grad_norm": 0.5413455367088318, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7648228406906128, + "num_tokens": 575644425.0, + "step": 1202 + }, + { + "epoch": 0.713946587537092, + "grad_norm": 0.5835498571395874, + "learning_rate": 1e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7567095756530762, + "num_tokens": 576138294.0, + "step": 1203 + }, + { + "epoch": 0.714540059347181, + "grad_norm": 0.5348433256149292, + "learning_rate": 1e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7681338787078857, + "num_tokens": 576620169.0, + "step": 1204 + }, + { + "epoch": 0.7151335311572701, + "grad_norm": 0.5649370551109314, + "learning_rate": 1e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7596918344497681, + "num_tokens": 577124943.0, + "step": 1205 + }, + { + "epoch": 0.715727002967359, + "grad_norm": 0.5820286273956299, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7615686655044556, + "num_tokens": 577582023.0, + "step": 1206 + }, + { + "epoch": 0.7163204747774481, + "grad_norm": 0.5529128313064575, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7700101137161255, + "num_tokens": 578054423.0, + "step": 1207 + }, + { + "epoch": 0.716913946587537, + "grad_norm": 0.5744956135749817, + "learning_rate": 1e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7501598596572876, + "num_tokens": 578507877.0, + "step": 1208 + }, + { + "epoch": 0.7175074183976261, + "grad_norm": 0.542460560798645, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7584662437438965, + "num_tokens": 579004974.0, + "step": 1209 + }, + { + "epoch": 0.7181008902077152, + "grad_norm": 0.5453004837036133, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7585228085517883, + "num_tokens": 579519037.0, + "step": 1210 + 
}, + { + "epoch": 0.7186943620178041, + "grad_norm": 0.526505172252655, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7605125308036804, + "num_tokens": 580016222.0, + "step": 1211 + }, + { + "epoch": 0.7192878338278932, + "grad_norm": 0.5559287667274475, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7598874568939209, + "num_tokens": 580487535.0, + "step": 1212 + }, + { + "epoch": 0.7198813056379822, + "grad_norm": 0.5716094970703125, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7720000743865967, + "num_tokens": 580939561.0, + "step": 1213 + }, + { + "epoch": 0.7204747774480712, + "grad_norm": 0.5276507139205933, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.759251594543457, + "num_tokens": 581460256.0, + "step": 1214 + }, + { + "epoch": 0.7210682492581603, + "grad_norm": 0.5452361702919006, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7676224708557129, + "num_tokens": 581919406.0, + "step": 1215 + }, + { + "epoch": 0.7216617210682492, + "grad_norm": 0.5366619825363159, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.772537887096405, + "num_tokens": 582388267.0, + "step": 1216 + }, + { + "epoch": 0.7222551928783383, + "grad_norm": 0.5447930693626404, + "learning_rate": 1e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7587323784828186, + "num_tokens": 582866029.0, + "step": 1217 + }, + { + "epoch": 0.7228486646884273, + "grad_norm": 0.5211935639381409, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7636154890060425, + "num_tokens": 583387481.0, + "step": 1218 + }, + { + "epoch": 0.7234421364985163, + "grad_norm": 0.5521057844161987, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7593264579772949, + "num_tokens": 583871518.0, + "step": 1219 + }, + { + "epoch": 0.7240356083086054, + "grad_norm": 0.5549367070198059, + "learning_rate": 1e-06, + "loss": 0.7861, + 
"mean_token_accuracy": 0.7540911436080933, + "num_tokens": 584336261.0, + "step": 1220 + }, + { + "epoch": 0.7246290801186943, + "grad_norm": 0.5364219546318054, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7639095187187195, + "num_tokens": 584799854.0, + "step": 1221 + }, + { + "epoch": 0.7252225519287834, + "grad_norm": 0.5282636284828186, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7626916766166687, + "num_tokens": 585282687.0, + "step": 1222 + }, + { + "epoch": 0.7258160237388724, + "grad_norm": 0.5525956749916077, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7578067779541016, + "num_tokens": 585741525.0, + "step": 1223 + }, + { + "epoch": 0.7264094955489614, + "grad_norm": 0.5957592725753784, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7522852420806885, + "num_tokens": 586197486.0, + "step": 1224 + }, + { + "epoch": 0.7270029673590505, + "grad_norm": 0.5279615521430969, + "learning_rate": 1e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7651942372322083, + "num_tokens": 586687409.0, + "step": 1225 + }, + { + "epoch": 0.7275964391691395, + "grad_norm": 0.5279904007911682, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.750141441822052, + "num_tokens": 587189791.0, + "step": 1226 + }, + { + "epoch": 0.7281899109792285, + "grad_norm": 0.5426216721534729, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7615196704864502, + "num_tokens": 587667879.0, + "step": 1227 + }, + { + "epoch": 0.7287833827893175, + "grad_norm": 0.6011440753936768, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7662267684936523, + "num_tokens": 588124547.0, + "step": 1228 + }, + { + "epoch": 0.7293768545994065, + "grad_norm": 0.558251142501831, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7563974857330322, + "num_tokens": 588637260.0, + "step": 1229 + }, + { + "epoch": 0.7299703264094956, + 
"grad_norm": 0.5393770337104797, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7545849084854126, + "num_tokens": 589123959.0, + "step": 1230 + }, + { + "epoch": 0.7305637982195846, + "grad_norm": 0.5916358828544617, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7610774040222168, + "num_tokens": 589577534.0, + "step": 1231 + }, + { + "epoch": 0.7311572700296736, + "grad_norm": 0.5784099698066711, + "learning_rate": 1e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.754104733467102, + "num_tokens": 590081825.0, + "step": 1232 + }, + { + "epoch": 0.7317507418397626, + "grad_norm": 0.53337162733078, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7569857835769653, + "num_tokens": 590617787.0, + "step": 1233 + }, + { + "epoch": 0.7323442136498516, + "grad_norm": 0.5296884179115295, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7628138065338135, + "num_tokens": 591096076.0, + "step": 1234 + }, + { + "epoch": 0.7329376854599406, + "grad_norm": 0.54530268907547, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7568345069885254, + "num_tokens": 591549951.0, + "step": 1235 + }, + { + "epoch": 0.7335311572700297, + "grad_norm": 0.5920829772949219, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7634822130203247, + "num_tokens": 592019874.0, + "step": 1236 + }, + { + "epoch": 0.7341246290801187, + "grad_norm": 0.6051641702651978, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.757521390914917, + "num_tokens": 592444990.0, + "step": 1237 + }, + { + "epoch": 0.7347181008902077, + "grad_norm": 0.5259174108505249, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7684749364852905, + "num_tokens": 592939020.0, + "step": 1238 + }, + { + "epoch": 0.7353115727002968, + "grad_norm": 0.505032479763031, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7682197690010071, + 
"num_tokens": 593446158.0, + "step": 1239 + }, + { + "epoch": 0.7359050445103857, + "grad_norm": 0.5950748324394226, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7549042701721191, + "num_tokens": 593898553.0, + "step": 1240 + }, + { + "epoch": 0.7364985163204748, + "grad_norm": 0.5193106532096863, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7658169865608215, + "num_tokens": 594422650.0, + "step": 1241 + }, + { + "epoch": 0.7370919881305638, + "grad_norm": 0.5850328207015991, + "learning_rate": 1e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7597079873085022, + "num_tokens": 594867509.0, + "step": 1242 + }, + { + "epoch": 0.7376854599406528, + "grad_norm": 0.5597022771835327, + "learning_rate": 1e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7490900754928589, + "num_tokens": 595321039.0, + "step": 1243 + }, + { + "epoch": 0.7382789317507419, + "grad_norm": 0.5680525302886963, + "learning_rate": 1e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.7646342515945435, + "num_tokens": 595782349.0, + "step": 1244 + }, + { + "epoch": 0.7388724035608308, + "grad_norm": 0.5217905044555664, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7700110673904419, + "num_tokens": 596276242.0, + "step": 1245 + }, + { + "epoch": 0.7394658753709199, + "grad_norm": 0.5302740335464478, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7639691829681396, + "num_tokens": 596762860.0, + "step": 1246 + }, + { + "epoch": 0.7400593471810089, + "grad_norm": 0.6095273494720459, + "learning_rate": 1e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.7471937537193298, + "num_tokens": 597206022.0, + "step": 1247 + }, + { + "epoch": 0.7406528189910979, + "grad_norm": 0.5519376993179321, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7690911889076233, + "num_tokens": 597679913.0, + "step": 1248 + }, + { + "epoch": 0.741246290801187, + "grad_norm": 0.5251342058181763, + 
"learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7537987232208252, + "num_tokens": 598189274.0, + "step": 1249 + }, + { + "epoch": 0.7418397626112759, + "grad_norm": 0.5351078510284424, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7555697560310364, + "num_tokens": 598703990.0, + "step": 1250 + }, + { + "epoch": 0.742433234421365, + "grad_norm": 0.5670009255409241, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7548447251319885, + "num_tokens": 599162424.0, + "step": 1251 + }, + { + "epoch": 0.7430267062314541, + "grad_norm": 0.5497141480445862, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7603181600570679, + "num_tokens": 599617573.0, + "step": 1252 + }, + { + "epoch": 0.743620178041543, + "grad_norm": 0.534025251865387, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7611511945724487, + "num_tokens": 600144817.0, + "step": 1253 + }, + { + "epoch": 0.7442136498516321, + "grad_norm": 0.520912230014801, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7572778463363647, + "num_tokens": 600658825.0, + "step": 1254 + }, + { + "epoch": 0.744807121661721, + "grad_norm": 0.5292118191719055, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7499262094497681, + "num_tokens": 601147608.0, + "step": 1255 + }, + { + "epoch": 0.7454005934718101, + "grad_norm": 0.5546944737434387, + "learning_rate": 1e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7582240104675293, + "num_tokens": 601636102.0, + "step": 1256 + }, + { + "epoch": 0.7459940652818992, + "grad_norm": 0.5658095479011536, + "learning_rate": 1e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.751741349697113, + "num_tokens": 602107797.0, + "step": 1257 + }, + { + "epoch": 0.7465875370919881, + "grad_norm": 0.5714346766471863, + "learning_rate": 1e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7608079314231873, + "num_tokens": 602595422.0, + "step": 1258 + }, 
+ { + "epoch": 0.7471810089020772, + "grad_norm": 0.5730372667312622, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7700545191764832, + "num_tokens": 603032912.0, + "step": 1259 + }, + { + "epoch": 0.7477744807121661, + "grad_norm": 0.5419037342071533, + "learning_rate": 1e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7526615858078003, + "num_tokens": 603525350.0, + "step": 1260 + }, + { + "epoch": 0.7483679525222552, + "grad_norm": 0.5566868782043457, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7563493847846985, + "num_tokens": 604007311.0, + "step": 1261 + }, + { + "epoch": 0.7489614243323442, + "grad_norm": 0.5453252196311951, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7611075639724731, + "num_tokens": 604530236.0, + "step": 1262 + }, + { + "epoch": 0.7495548961424332, + "grad_norm": 0.5417011380195618, + "learning_rate": 1e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7530782222747803, + "num_tokens": 604983342.0, + "step": 1263 + }, + { + "epoch": 0.7501483679525223, + "grad_norm": 0.5369787812232971, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7726542353630066, + "num_tokens": 605432830.0, + "step": 1264 + }, + { + "epoch": 0.7507418397626113, + "grad_norm": 0.5993219614028931, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7591019868850708, + "num_tokens": 605883440.0, + "step": 1265 + }, + { + "epoch": 0.7513353115727003, + "grad_norm": 0.565676748752594, + "learning_rate": 1e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.774996280670166, + "num_tokens": 606350140.0, + "step": 1266 + }, + { + "epoch": 0.7519287833827893, + "grad_norm": 0.5045488476753235, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.771600067615509, + "num_tokens": 606853629.0, + "step": 1267 + }, + { + "epoch": 0.7525222551928783, + "grad_norm": 0.558651864528656, + "learning_rate": 1e-06, + "loss": 0.8148, + 
"mean_token_accuracy": 0.7495545148849487, + "num_tokens": 607313661.0, + "step": 1268 + }, + { + "epoch": 0.7531157270029674, + "grad_norm": 0.4987238645553589, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7750920057296753, + "num_tokens": 607827331.0, + "step": 1269 + }, + { + "epoch": 0.7537091988130564, + "grad_norm": 0.6652017831802368, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7703602910041809, + "num_tokens": 608335239.0, + "step": 1270 + }, + { + "epoch": 0.7543026706231454, + "grad_norm": 0.5362972617149353, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7562574744224548, + "num_tokens": 608800964.0, + "step": 1271 + }, + { + "epoch": 0.7548961424332344, + "grad_norm": 0.5045185089111328, + "learning_rate": 1e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7606598138809204, + "num_tokens": 609329555.0, + "step": 1272 + }, + { + "epoch": 0.7554896142433234, + "grad_norm": 0.5328187346458435, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7483664751052856, + "num_tokens": 609798304.0, + "step": 1273 + }, + { + "epoch": 0.7560830860534125, + "grad_norm": 0.5308432579040527, + "learning_rate": 1e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7707761526107788, + "num_tokens": 610312736.0, + "step": 1274 + }, + { + "epoch": 0.7566765578635015, + "grad_norm": 0.5336160063743591, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7642308473587036, + "num_tokens": 610805608.0, + "step": 1275 + }, + { + "epoch": 0.7572700296735905, + "grad_norm": 0.5392115712165833, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7658028602600098, + "num_tokens": 611272258.0, + "step": 1276 + }, + { + "epoch": 0.7578635014836795, + "grad_norm": 0.5442535281181335, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.760533332824707, + "num_tokens": 611759953.0, + "step": 1277 + }, + { + "epoch": 0.7584569732937686, + 
"grad_norm": 0.5626965761184692, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7419655919075012, + "num_tokens": 612254826.0, + "step": 1278 + }, + { + "epoch": 0.7590504451038576, + "grad_norm": 0.5193665623664856, + "learning_rate": 1e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7698767185211182, + "num_tokens": 612786518.0, + "step": 1279 + }, + { + "epoch": 0.7596439169139466, + "grad_norm": 0.5387042164802551, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7622362375259399, + "num_tokens": 613271431.0, + "step": 1280 + }, + { + "epoch": 0.7602373887240356, + "grad_norm": 0.5755212306976318, + "learning_rate": 1e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7538747787475586, + "num_tokens": 613732038.0, + "step": 1281 + }, + { + "epoch": 0.7608308605341246, + "grad_norm": 0.5216104388237, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7696459889411926, + "num_tokens": 614227372.0, + "step": 1282 + }, + { + "epoch": 0.7614243323442137, + "grad_norm": 0.5041214227676392, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7503492832183838, + "num_tokens": 614767247.0, + "step": 1283 + }, + { + "epoch": 0.7620178041543026, + "grad_norm": 0.552785336971283, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7552711367607117, + "num_tokens": 615246655.0, + "step": 1284 + }, + { + "epoch": 0.7626112759643917, + "grad_norm": 0.5200415253639221, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.763901948928833, + "num_tokens": 615775123.0, + "step": 1285 + }, + { + "epoch": 0.7632047477744807, + "grad_norm": 0.5183860063552856, + "learning_rate": 1e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7478324174880981, + "num_tokens": 616294318.0, + "step": 1286 + }, + { + "epoch": 0.7637982195845697, + "grad_norm": 0.5574998259544373, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7484427690505981, + 
"num_tokens": 616772249.0, + "step": 1287 + }, + { + "epoch": 0.7643916913946588, + "grad_norm": 0.5141788721084595, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7613083124160767, + "num_tokens": 617293988.0, + "step": 1288 + }, + { + "epoch": 0.7649851632047477, + "grad_norm": 0.5134835839271545, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7574400305747986, + "num_tokens": 617772033.0, + "step": 1289 + }, + { + "epoch": 0.7655786350148368, + "grad_norm": 0.5740748047828674, + "learning_rate": 1e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7637384533882141, + "num_tokens": 618219860.0, + "step": 1290 + }, + { + "epoch": 0.7661721068249259, + "grad_norm": 0.5728841423988342, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7567113041877747, + "num_tokens": 618687395.0, + "step": 1291 + }, + { + "epoch": 0.7667655786350148, + "grad_norm": 0.5599428415298462, + "learning_rate": 1e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7671019434928894, + "num_tokens": 619176539.0, + "step": 1292 + }, + { + "epoch": 0.7673590504451039, + "grad_norm": 0.5595715045928955, + "learning_rate": 1e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7508096098899841, + "num_tokens": 619621160.0, + "step": 1293 + }, + { + "epoch": 0.7679525222551928, + "grad_norm": 0.568273663520813, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7613792419433594, + "num_tokens": 620083732.0, + "step": 1294 + }, + { + "epoch": 0.7685459940652819, + "grad_norm": 0.5145353078842163, + "learning_rate": 1e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7763546705245972, + "num_tokens": 620568127.0, + "step": 1295 + }, + { + "epoch": 0.769139465875371, + "grad_norm": 0.5281001329421997, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.748423159122467, + "num_tokens": 621050564.0, + "step": 1296 + }, + { + "epoch": 0.7697329376854599, + "grad_norm": 0.5190526247024536, + 
"learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7627578973770142, + "num_tokens": 621558689.0, + "step": 1297 + }, + { + "epoch": 0.770326409495549, + "grad_norm": 0.5299444794654846, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7703335285186768, + "num_tokens": 622044753.0, + "step": 1298 + }, + { + "epoch": 0.7709198813056379, + "grad_norm": 0.5251760482788086, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7577390074729919, + "num_tokens": 622532445.0, + "step": 1299 + }, + { + "epoch": 0.771513353115727, + "grad_norm": 0.5225978493690491, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7572876214981079, + "num_tokens": 623016038.0, + "step": 1300 + }, + { + "epoch": 0.7721068249258161, + "grad_norm": 0.5373939275741577, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7586191892623901, + "num_tokens": 623461644.0, + "step": 1301 + }, + { + "epoch": 0.772700296735905, + "grad_norm": 0.515791118144989, + "learning_rate": 1e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7809798121452332, + "num_tokens": 623949983.0, + "step": 1302 + }, + { + "epoch": 0.7732937685459941, + "grad_norm": 0.5468103885650635, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7692770957946777, + "num_tokens": 624418288.0, + "step": 1303 + }, + { + "epoch": 0.7738872403560831, + "grad_norm": 0.5417343378067017, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7622655630111694, + "num_tokens": 624877565.0, + "step": 1304 + }, + { + "epoch": 0.7744807121661721, + "grad_norm": 0.5489686131477356, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7474817037582397, + "num_tokens": 625339125.0, + "step": 1305 + }, + { + "epoch": 0.7750741839762612, + "grad_norm": 0.5460302233695984, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7568994760513306, + "num_tokens": 625828950.0, + "step": 1306 + 
}, + { + "epoch": 0.7756676557863501, + "grad_norm": 0.6243822574615479, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7729068398475647, + "num_tokens": 626324447.0, + "step": 1307 + }, + { + "epoch": 0.7762611275964392, + "grad_norm": 0.494763046503067, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7518100738525391, + "num_tokens": 626854159.0, + "step": 1308 + }, + { + "epoch": 0.7768545994065282, + "grad_norm": 0.50877445936203, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7739376425743103, + "num_tokens": 627364215.0, + "step": 1309 + }, + { + "epoch": 0.7774480712166172, + "grad_norm": 0.5195890665054321, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7626669406890869, + "num_tokens": 627876776.0, + "step": 1310 + }, + { + "epoch": 0.7780415430267063, + "grad_norm": 0.5398515462875366, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7626506686210632, + "num_tokens": 628388647.0, + "step": 1311 + }, + { + "epoch": 0.7786350148367952, + "grad_norm": 0.547382652759552, + "learning_rate": 1e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7671017050743103, + "num_tokens": 628860708.0, + "step": 1312 + }, + { + "epoch": 0.7792284866468843, + "grad_norm": 0.5495597124099731, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7531055212020874, + "num_tokens": 629309158.0, + "step": 1313 + }, + { + "epoch": 0.7798219584569733, + "grad_norm": 0.5803837776184082, + "learning_rate": 1e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.758162796497345, + "num_tokens": 629813111.0, + "step": 1314 + }, + { + "epoch": 0.7804154302670623, + "grad_norm": 0.57581627368927, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7553257942199707, + "num_tokens": 630281372.0, + "step": 1315 + }, + { + "epoch": 0.7810089020771513, + "grad_norm": 0.5204533338546753, + "learning_rate": 1e-06, + "loss": 0.77, + 
"mean_token_accuracy": 0.7610307335853577, + "num_tokens": 630764213.0, + "step": 1316 + }, + { + "epoch": 0.7816023738872404, + "grad_norm": 0.5452558398246765, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.753769040107727, + "num_tokens": 631275158.0, + "step": 1317 + }, + { + "epoch": 0.7821958456973294, + "grad_norm": 0.5835568308830261, + "learning_rate": 1e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.7601416707038879, + "num_tokens": 631734298.0, + "step": 1318 + }, + { + "epoch": 0.7827893175074184, + "grad_norm": 0.554096519947052, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7410178184509277, + "num_tokens": 632230060.0, + "step": 1319 + }, + { + "epoch": 0.7833827893175074, + "grad_norm": 0.5092306137084961, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7436723709106445, + "num_tokens": 632731442.0, + "step": 1320 + }, + { + "epoch": 0.7839762611275964, + "grad_norm": 0.5459077954292297, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7455660104751587, + "num_tokens": 633228800.0, + "step": 1321 + }, + { + "epoch": 0.7845697329376855, + "grad_norm": 0.5655431747436523, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7588773965835571, + "num_tokens": 633725664.0, + "step": 1322 + }, + { + "epoch": 0.7851632047477745, + "grad_norm": 0.5434131026268005, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.745990514755249, + "num_tokens": 634251744.0, + "step": 1323 + }, + { + "epoch": 0.7857566765578635, + "grad_norm": 0.5580204725265503, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7665289640426636, + "num_tokens": 634670806.0, + "step": 1324 + }, + { + "epoch": 0.7863501483679525, + "grad_norm": 0.535300612449646, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7607446908950806, + "num_tokens": 635157407.0, + "step": 1325 + }, + { + "epoch": 0.7869436201780415, + 
"grad_norm": 0.5734478831291199, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7591487169265747, + "num_tokens": 635631392.0, + "step": 1326 + }, + { + "epoch": 0.7875370919881306, + "grad_norm": 0.5614514350891113, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7603999376296997, + "num_tokens": 636107635.0, + "step": 1327 + }, + { + "epoch": 0.7881305637982196, + "grad_norm": 0.5360249876976013, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7592893242835999, + "num_tokens": 636588929.0, + "step": 1328 + }, + { + "epoch": 0.7887240356083086, + "grad_norm": 0.5250363349914551, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7593998908996582, + "num_tokens": 637095307.0, + "step": 1329 + }, + { + "epoch": 0.7893175074183977, + "grad_norm": 0.5383617281913757, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7470459342002869, + "num_tokens": 637591432.0, + "step": 1330 + }, + { + "epoch": 0.7899109792284866, + "grad_norm": 0.525212287902832, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7681478261947632, + "num_tokens": 638080359.0, + "step": 1331 + }, + { + "epoch": 0.7905044510385757, + "grad_norm": 0.5618284344673157, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7421983480453491, + "num_tokens": 638582068.0, + "step": 1332 + }, + { + "epoch": 0.7910979228486646, + "grad_norm": 0.5381459593772888, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7470153570175171, + "num_tokens": 639087753.0, + "step": 1333 + }, + { + "epoch": 0.7916913946587537, + "grad_norm": 0.5677259564399719, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7516691088676453, + "num_tokens": 639551319.0, + "step": 1334 + }, + { + "epoch": 0.7922848664688428, + "grad_norm": 0.5460424423217773, + "learning_rate": 1e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7591023445129395, + 
"num_tokens": 640014040.0, + "step": 1335 + }, + { + "epoch": 0.7928783382789317, + "grad_norm": 0.536196231842041, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7595245838165283, + "num_tokens": 640509895.0, + "step": 1336 + }, + { + "epoch": 0.7934718100890208, + "grad_norm": 0.5733128190040588, + "learning_rate": 1e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7745388746261597, + "num_tokens": 640985532.0, + "step": 1337 + }, + { + "epoch": 0.7940652818991097, + "grad_norm": 0.5592982172966003, + "learning_rate": 1e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7572548389434814, + "num_tokens": 641440559.0, + "step": 1338 + }, + { + "epoch": 0.7946587537091988, + "grad_norm": 0.5408653616905212, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.752709150314331, + "num_tokens": 641909492.0, + "step": 1339 + }, + { + "epoch": 0.7952522255192879, + "grad_norm": 0.5659368634223938, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7629530429840088, + "num_tokens": 642372625.0, + "step": 1340 + }, + { + "epoch": 0.7958456973293768, + "grad_norm": 0.5684881806373596, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.762815535068512, + "num_tokens": 642882280.0, + "step": 1341 + }, + { + "epoch": 0.7964391691394659, + "grad_norm": 0.5332571268081665, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7620574235916138, + "num_tokens": 643361737.0, + "step": 1342 + }, + { + "epoch": 0.797032640949555, + "grad_norm": 0.5446078181266785, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7561990022659302, + "num_tokens": 643860555.0, + "step": 1343 + }, + { + "epoch": 0.7976261127596439, + "grad_norm": 0.5385493636131287, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7673972249031067, + "num_tokens": 644365247.0, + "step": 1344 + }, + { + "epoch": 0.798219584569733, + "grad_norm": 0.5184741616249084, + 
"learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7595222592353821, + "num_tokens": 644872548.0, + "step": 1345 + }, + { + "epoch": 0.7988130563798219, + "grad_norm": 0.5299637317657471, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7583218812942505, + "num_tokens": 645342415.0, + "step": 1346 + }, + { + "epoch": 0.799406528189911, + "grad_norm": 0.531974732875824, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7577105164527893, + "num_tokens": 645818707.0, + "step": 1347 + }, + { + "epoch": 0.8, + "grad_norm": 0.5258042812347412, + "learning_rate": 1e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7558083534240723, + "num_tokens": 646321803.0, + "step": 1348 + }, + { + "epoch": 0.800593471810089, + "grad_norm": 0.5472694635391235, + "learning_rate": 1e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.76612389087677, + "num_tokens": 646814480.0, + "step": 1349 + }, + { + "epoch": 0.8011869436201781, + "grad_norm": 0.5275675654411316, + "learning_rate": 1e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7702908515930176, + "num_tokens": 647279003.0, + "step": 1350 + }, + { + "epoch": 0.801780415430267, + "grad_norm": 0.5531714558601379, + "learning_rate": 1e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.755946695804596, + "num_tokens": 647765180.0, + "step": 1351 + }, + { + "epoch": 0.8023738872403561, + "grad_norm": 0.5445864200592041, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7652202844619751, + "num_tokens": 648261260.0, + "step": 1352 + }, + { + "epoch": 0.8029673590504451, + "grad_norm": 0.5737276673316956, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7555241584777832, + "num_tokens": 648694014.0, + "step": 1353 + }, + { + "epoch": 0.8035608308605341, + "grad_norm": 0.5389559268951416, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7578667402267456, + "num_tokens": 649156282.0, + "step": 1354 + }, + { + "epoch": 
0.8041543026706232, + "grad_norm": 0.519045889377594, + "learning_rate": 1e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7642953395843506, + "num_tokens": 649644813.0, + "step": 1355 + }, + { + "epoch": 0.8047477744807122, + "grad_norm": 0.5585653781890869, + "learning_rate": 1e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7557286024093628, + "num_tokens": 650110660.0, + "step": 1356 + }, + { + "epoch": 0.8053412462908012, + "grad_norm": 0.5742524862289429, + "learning_rate": 1e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7656286358833313, + "num_tokens": 650576406.0, + "step": 1357 + }, + { + "epoch": 0.8059347181008902, + "grad_norm": 0.548732340335846, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7533860206604004, + "num_tokens": 651033363.0, + "step": 1358 + }, + { + "epoch": 0.8065281899109792, + "grad_norm": 0.5561477541923523, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7558311820030212, + "num_tokens": 651499573.0, + "step": 1359 + }, + { + "epoch": 0.8071216617210683, + "grad_norm": 0.528300940990448, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7515662908554077, + "num_tokens": 651996204.0, + "step": 1360 + }, + { + "epoch": 0.8077151335311573, + "grad_norm": 0.5819301605224609, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7419940233230591, + "num_tokens": 652429796.0, + "step": 1361 + }, + { + "epoch": 0.8083086053412463, + "grad_norm": 0.5448263883590698, + "learning_rate": 1e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7549907565116882, + "num_tokens": 652896655.0, + "step": 1362 + }, + { + "epoch": 0.8089020771513353, + "grad_norm": 0.520682156085968, + "learning_rate": 1e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.7711506485939026, + "num_tokens": 653400486.0, + "step": 1363 + }, + { + "epoch": 0.8094955489614243, + "grad_norm": 0.5489243865013123, + "learning_rate": 1e-06, + "loss": 0.7286, + "mean_token_accuracy": 
0.770624041557312, + "num_tokens": 653856894.0, + "step": 1364 + }, + { + "epoch": 0.8100890207715133, + "grad_norm": 0.537106454372406, + "learning_rate": 1e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7578310370445251, + "num_tokens": 654326946.0, + "step": 1365 + }, + { + "epoch": 0.8106824925816024, + "grad_norm": 0.5512174367904663, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7390304207801819, + "num_tokens": 654812859.0, + "step": 1366 + }, + { + "epoch": 0.8112759643916914, + "grad_norm": 0.5465667843818665, + "learning_rate": 1e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.7648171186447144, + "num_tokens": 655287674.0, + "step": 1367 + }, + { + "epoch": 0.8118694362017804, + "grad_norm": 0.5221254229545593, + "learning_rate": 1e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7631638050079346, + "num_tokens": 655807552.0, + "step": 1368 + }, + { + "epoch": 0.8124629080118695, + "grad_norm": 0.5375224947929382, + "learning_rate": 1e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.751593828201294, + "num_tokens": 656303891.0, + "step": 1369 + }, + { + "epoch": 0.8130563798219584, + "grad_norm": 0.5422762632369995, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7663414478302002, + "num_tokens": 656748143.0, + "step": 1370 + }, + { + "epoch": 0.8136498516320475, + "grad_norm": 0.583605170249939, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7524368166923523, + "num_tokens": 657175598.0, + "step": 1371 + }, + { + "epoch": 0.8142433234421365, + "grad_norm": 0.5495569109916687, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7637591361999512, + "num_tokens": 657657047.0, + "step": 1372 + }, + { + "epoch": 0.8148367952522255, + "grad_norm": 0.5285443067550659, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7737729549407959, + "num_tokens": 658169330.0, + "step": 1373 + }, + { + "epoch": 0.8154302670623146, + "grad_norm": 
0.5541511178016663, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.764283299446106, + "num_tokens": 658625178.0, + "step": 1374 + }, + { + "epoch": 0.8160237388724035, + "grad_norm": 0.5500236749649048, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7637395858764648, + "num_tokens": 659116398.0, + "step": 1375 + }, + { + "epoch": 0.8166172106824926, + "grad_norm": 0.527463972568512, + "learning_rate": 1e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7843306064605713, + "num_tokens": 659620919.0, + "step": 1376 + }, + { + "epoch": 0.8172106824925816, + "grad_norm": 0.5425734519958496, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7645263671875, + "num_tokens": 660110950.0, + "step": 1377 + }, + { + "epoch": 0.8178041543026706, + "grad_norm": 0.5441709756851196, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7631016969680786, + "num_tokens": 660587569.0, + "step": 1378 + }, + { + "epoch": 0.8183976261127597, + "grad_norm": 0.5262066125869751, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.767975926399231, + "num_tokens": 661071259.0, + "step": 1379 + }, + { + "epoch": 0.8189910979228486, + "grad_norm": 0.5472378134727478, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7757098078727722, + "num_tokens": 661529386.0, + "step": 1380 + }, + { + "epoch": 0.8195845697329377, + "grad_norm": 0.5701904892921448, + "learning_rate": 1e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7560179233551025, + "num_tokens": 662021336.0, + "step": 1381 + }, + { + "epoch": 0.8201780415430268, + "grad_norm": 0.5624939799308777, + "learning_rate": 1e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7713282108306885, + "num_tokens": 662475149.0, + "step": 1382 + }, + { + "epoch": 0.8207715133531157, + "grad_norm": 0.5261802077293396, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7619401216506958, + "num_tokens": 
662961833.0, + "step": 1383 + }, + { + "epoch": 0.8213649851632048, + "grad_norm": 0.5893220901489258, + "learning_rate": 1e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7521143555641174, + "num_tokens": 663396875.0, + "step": 1384 + }, + { + "epoch": 0.8219584569732937, + "grad_norm": 0.5559762716293335, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7599813938140869, + "num_tokens": 663870404.0, + "step": 1385 + }, + { + "epoch": 0.8225519287833828, + "grad_norm": 0.564203679561615, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7528002858161926, + "num_tokens": 664325084.0, + "step": 1386 + }, + { + "epoch": 0.8231454005934719, + "grad_norm": 0.5478346347808838, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7627475261688232, + "num_tokens": 664840723.0, + "step": 1387 + }, + { + "epoch": 0.8237388724035608, + "grad_norm": 0.5695214867591858, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7606298923492432, + "num_tokens": 665296879.0, + "step": 1388 + }, + { + "epoch": 0.8243323442136499, + "grad_norm": 0.5380672812461853, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7515439987182617, + "num_tokens": 665817699.0, + "step": 1389 + }, + { + "epoch": 0.8249258160237388, + "grad_norm": 0.5287579894065857, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7593762874603271, + "num_tokens": 666308336.0, + "step": 1390 + }, + { + "epoch": 0.8255192878338279, + "grad_norm": 0.5708578824996948, + "learning_rate": 1e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7563409805297852, + "num_tokens": 666755552.0, + "step": 1391 + }, + { + "epoch": 0.826112759643917, + "grad_norm": 0.5356185436248779, + "learning_rate": 1e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7722305059432983, + "num_tokens": 667250098.0, + "step": 1392 + }, + { + "epoch": 0.8267062314540059, + "grad_norm": 0.5733942985534668, + "learning_rate": 1e-06, 
+ "loss": 0.7799, + "mean_token_accuracy": 0.7568099498748779, + "num_tokens": 667690948.0, + "step": 1393 + }, + { + "epoch": 0.827299703264095, + "grad_norm": 0.5654734373092651, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7712242603302002, + "num_tokens": 668188933.0, + "step": 1394 + }, + { + "epoch": 0.827893175074184, + "grad_norm": 0.5476788282394409, + "learning_rate": 1e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7519007325172424, + "num_tokens": 668673454.0, + "step": 1395 + }, + { + "epoch": 0.828486646884273, + "grad_norm": 0.5597893595695496, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7615355849266052, + "num_tokens": 669157149.0, + "step": 1396 + }, + { + "epoch": 0.829080118694362, + "grad_norm": 0.581238329410553, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7500553131103516, + "num_tokens": 669627897.0, + "step": 1397 + }, + { + "epoch": 0.829673590504451, + "grad_norm": 0.5410245060920715, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7646617293357849, + "num_tokens": 670090439.0, + "step": 1398 + }, + { + "epoch": 0.8302670623145401, + "grad_norm": 0.5351824164390564, + "learning_rate": 1e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7553079128265381, + "num_tokens": 670573699.0, + "step": 1399 + }, + { + "epoch": 0.8308605341246291, + "grad_norm": 0.5452401638031006, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7486651539802551, + "num_tokens": 671103052.0, + "step": 1400 + }, + { + "epoch": 0.8314540059347181, + "grad_norm": 0.5465162992477417, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7517284154891968, + "num_tokens": 671591454.0, + "step": 1401 + }, + { + "epoch": 0.8320474777448071, + "grad_norm": 0.5473088026046753, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.752689003944397, + "num_tokens": 672078410.0, + "step": 1402 + }, + { + "epoch": 
0.8326409495548961, + "grad_norm": 0.5364429950714111, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7703477144241333, + "num_tokens": 672551425.0, + "step": 1403 + }, + { + "epoch": 0.8332344213649852, + "grad_norm": 0.566776692867279, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7597219944000244, + "num_tokens": 672989689.0, + "step": 1404 + }, + { + "epoch": 0.8338278931750742, + "grad_norm": 0.588154137134552, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7624976634979248, + "num_tokens": 673445768.0, + "step": 1405 + }, + { + "epoch": 0.8344213649851632, + "grad_norm": 0.5086617469787598, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7652546167373657, + "num_tokens": 673995411.0, + "step": 1406 + }, + { + "epoch": 0.8350148367952522, + "grad_norm": 0.5283513069152832, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7559911012649536, + "num_tokens": 674486543.0, + "step": 1407 + }, + { + "epoch": 0.8356083086053413, + "grad_norm": 0.5194128155708313, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.756543755531311, + "num_tokens": 674975047.0, + "step": 1408 + }, + { + "epoch": 0.8362017804154303, + "grad_norm": 0.558293342590332, + "learning_rate": 1e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7653253674507141, + "num_tokens": 675419452.0, + "step": 1409 + }, + { + "epoch": 0.8367952522255193, + "grad_norm": 0.5839561820030212, + "learning_rate": 1e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7590888142585754, + "num_tokens": 675875192.0, + "step": 1410 + }, + { + "epoch": 0.8373887240356083, + "grad_norm": 0.5592178106307983, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7467697858810425, + "num_tokens": 676339788.0, + "step": 1411 + }, + { + "epoch": 0.8379821958456973, + "grad_norm": 0.5443110466003418, + "learning_rate": 1e-06, + "loss": 0.742, + "mean_token_accuracy": 
0.7654780149459839, + "num_tokens": 676845321.0, + "step": 1412 + }, + { + "epoch": 0.8385756676557864, + "grad_norm": 0.5137251615524292, + "learning_rate": 1e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7702818512916565, + "num_tokens": 677342091.0, + "step": 1413 + }, + { + "epoch": 0.8391691394658753, + "grad_norm": 0.5624068975448608, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7635776996612549, + "num_tokens": 677807297.0, + "step": 1414 + }, + { + "epoch": 0.8397626112759644, + "grad_norm": 0.5253633260726929, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7608311176300049, + "num_tokens": 678303289.0, + "step": 1415 + }, + { + "epoch": 0.8403560830860534, + "grad_norm": 0.5644158124923706, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7666764259338379, + "num_tokens": 678806848.0, + "step": 1416 + }, + { + "epoch": 0.8409495548961424, + "grad_norm": 0.5574365258216858, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.755070149898529, + "num_tokens": 679277749.0, + "step": 1417 + }, + { + "epoch": 0.8415430267062315, + "grad_norm": 0.5157427191734314, + "learning_rate": 1e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7575739622116089, + "num_tokens": 679792693.0, + "step": 1418 + }, + { + "epoch": 0.8421364985163204, + "grad_norm": 0.5526737570762634, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7396101951599121, + "num_tokens": 680281105.0, + "step": 1419 + }, + { + "epoch": 0.8427299703264095, + "grad_norm": 0.532640278339386, + "learning_rate": 1e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.766266942024231, + "num_tokens": 680806969.0, + "step": 1420 + }, + { + "epoch": 0.8433234421364986, + "grad_norm": 0.5253825187683105, + "learning_rate": 1e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.7575101852416992, + "num_tokens": 681308846.0, + "step": 1421 + }, + { + "epoch": 0.8439169139465875, + "grad_norm": 
0.5461105108261108, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7663804292678833, + "num_tokens": 681751502.0, + "step": 1422 + }, + { + "epoch": 0.8445103857566766, + "grad_norm": 0.5557116270065308, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7446010112762451, + "num_tokens": 682239447.0, + "step": 1423 + }, + { + "epoch": 0.8451038575667655, + "grad_norm": 0.5314924120903015, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7524414658546448, + "num_tokens": 682728885.0, + "step": 1424 + }, + { + "epoch": 0.8456973293768546, + "grad_norm": 0.5488322973251343, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7589896321296692, + "num_tokens": 683237492.0, + "step": 1425 + }, + { + "epoch": 0.8462908011869437, + "grad_norm": 0.5398913621902466, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7677364945411682, + "num_tokens": 683697840.0, + "step": 1426 + }, + { + "epoch": 0.8468842729970326, + "grad_norm": 0.5316124558448792, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7709294557571411, + "num_tokens": 684196203.0, + "step": 1427 + }, + { + "epoch": 0.8474777448071217, + "grad_norm": 0.5459354519844055, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7739368677139282, + "num_tokens": 684657201.0, + "step": 1428 + }, + { + "epoch": 0.8480712166172106, + "grad_norm": 0.5285589098930359, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7737137079238892, + "num_tokens": 685111454.0, + "step": 1429 + }, + { + "epoch": 0.8486646884272997, + "grad_norm": 0.5879486799240112, + "learning_rate": 1e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7559103965759277, + "num_tokens": 685551820.0, + "step": 1430 + }, + { + "epoch": 0.8492581602373888, + "grad_norm": 0.5500279068946838, + "learning_rate": 1e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7585258483886719, + "num_tokens": 
686071498.0, + "step": 1431 + }, + { + "epoch": 0.8498516320474777, + "grad_norm": 0.5226611495018005, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7654927968978882, + "num_tokens": 686560889.0, + "step": 1432 + }, + { + "epoch": 0.8504451038575668, + "grad_norm": 0.5160412192344666, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7596844434738159, + "num_tokens": 687063198.0, + "step": 1433 + }, + { + "epoch": 0.8510385756676558, + "grad_norm": 0.5393741726875305, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7561277151107788, + "num_tokens": 687574058.0, + "step": 1434 + }, + { + "epoch": 0.8516320474777448, + "grad_norm": 0.5173771381378174, + "learning_rate": 1e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7573050856590271, + "num_tokens": 688073095.0, + "step": 1435 + }, + { + "epoch": 0.8522255192878339, + "grad_norm": 0.5480217337608337, + "learning_rate": 1e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7611591815948486, + "num_tokens": 688588017.0, + "step": 1436 + }, + { + "epoch": 0.8528189910979228, + "grad_norm": 0.5166006684303284, + "learning_rate": 1e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.7622236013412476, + "num_tokens": 689086290.0, + "step": 1437 + }, + { + "epoch": 0.8534124629080119, + "grad_norm": 0.5286623239517212, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7761198282241821, + "num_tokens": 689583978.0, + "step": 1438 + }, + { + "epoch": 0.8540059347181009, + "grad_norm": 0.5310486555099487, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7654868364334106, + "num_tokens": 690068231.0, + "step": 1439 + }, + { + "epoch": 0.8545994065281899, + "grad_norm": 0.5984750390052795, + "learning_rate": 1e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7505038976669312, + "num_tokens": 690480719.0, + "step": 1440 + }, + { + "epoch": 0.855192878338279, + "grad_norm": 0.5765839219093323, + "learning_rate": 1e-06, 
+ "loss": 0.7674, + "mean_token_accuracy": 0.7608562707901001, + "num_tokens": 690951542.0, + "step": 1441 + }, + { + "epoch": 0.8557863501483679, + "grad_norm": 0.543803334236145, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7585281133651733, + "num_tokens": 691471280.0, + "step": 1442 + }, + { + "epoch": 0.856379821958457, + "grad_norm": 0.5194708108901978, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7670482397079468, + "num_tokens": 691979779.0, + "step": 1443 + }, + { + "epoch": 0.856973293768546, + "grad_norm": 0.5411669015884399, + "learning_rate": 1e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7591109871864319, + "num_tokens": 692449130.0, + "step": 1444 + }, + { + "epoch": 0.857566765578635, + "grad_norm": 0.5673848986625671, + "learning_rate": 1e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.765324592590332, + "num_tokens": 692936492.0, + "step": 1445 + }, + { + "epoch": 0.858160237388724, + "grad_norm": 0.5337621569633484, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7665202617645264, + "num_tokens": 693430175.0, + "step": 1446 + }, + { + "epoch": 0.8587537091988131, + "grad_norm": 0.5362903475761414, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7569166421890259, + "num_tokens": 693908832.0, + "step": 1447 + }, + { + "epoch": 0.8593471810089021, + "grad_norm": 0.5387417078018188, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7556157112121582, + "num_tokens": 694418055.0, + "step": 1448 + }, + { + "epoch": 0.8599406528189911, + "grad_norm": 0.5617052316665649, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.764226496219635, + "num_tokens": 694849293.0, + "step": 1449 + }, + { + "epoch": 0.8605341246290801, + "grad_norm": 0.5300355553627014, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7638602256774902, + "num_tokens": 695340760.0, + "step": 1450 + }, + { + "epoch": 
0.8611275964391691, + "grad_norm": 0.5509142279624939, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7631222009658813, + "num_tokens": 695815231.0, + "step": 1451 + }, + { + "epoch": 0.8617210682492582, + "grad_norm": 0.5699421763420105, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7710981965065002, + "num_tokens": 696314347.0, + "step": 1452 + }, + { + "epoch": 0.8623145400593472, + "grad_norm": 0.5889654159545898, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7516361474990845, + "num_tokens": 696764600.0, + "step": 1453 + }, + { + "epoch": 0.8629080118694362, + "grad_norm": 0.5583620667457581, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.752425491809845, + "num_tokens": 697223329.0, + "step": 1454 + }, + { + "epoch": 0.8635014836795252, + "grad_norm": 0.5206276178359985, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7690086960792542, + "num_tokens": 697730066.0, + "step": 1455 + }, + { + "epoch": 0.8640949554896142, + "grad_norm": 0.5526474714279175, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7601112723350525, + "num_tokens": 698238168.0, + "step": 1456 + }, + { + "epoch": 0.8646884272997033, + "grad_norm": 0.5302124619483948, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7462564706802368, + "num_tokens": 698719137.0, + "step": 1457 + }, + { + "epoch": 0.8652818991097923, + "grad_norm": 0.5440181493759155, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7584669589996338, + "num_tokens": 699212234.0, + "step": 1458 + }, + { + "epoch": 0.8658753709198813, + "grad_norm": 0.5389121174812317, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7543368339538574, + "num_tokens": 699719808.0, + "step": 1459 + }, + { + "epoch": 0.8664688427299704, + "grad_norm": 0.5408107042312622, + "learning_rate": 1e-06, + "loss": 0.7201, + "mean_token_accuracy": 
0.7741724848747253, + "num_tokens": 700168855.0, + "step": 1460 + }, + { + "epoch": 0.8670623145400593, + "grad_norm": 0.5397465825080872, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7668783664703369, + "num_tokens": 700665700.0, + "step": 1461 + }, + { + "epoch": 0.8676557863501484, + "grad_norm": 0.5291056632995605, + "learning_rate": 1e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7546048164367676, + "num_tokens": 701165665.0, + "step": 1462 + }, + { + "epoch": 0.8682492581602373, + "grad_norm": 0.5444239377975464, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.760552704334259, + "num_tokens": 701645068.0, + "step": 1463 + }, + { + "epoch": 0.8688427299703264, + "grad_norm": 0.5329621434211731, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7590093016624451, + "num_tokens": 702157308.0, + "step": 1464 + }, + { + "epoch": 0.8694362017804155, + "grad_norm": 0.5365237593650818, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7652657628059387, + "num_tokens": 702607677.0, + "step": 1465 + }, + { + "epoch": 0.8700296735905044, + "grad_norm": 0.5258664488792419, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7414788603782654, + "num_tokens": 703106260.0, + "step": 1466 + }, + { + "epoch": 0.8706231454005935, + "grad_norm": 0.5627859830856323, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7544267177581787, + "num_tokens": 703585014.0, + "step": 1467 + }, + { + "epoch": 0.8712166172106824, + "grad_norm": 0.5769330263137817, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.745551347732544, + "num_tokens": 704047068.0, + "step": 1468 + }, + { + "epoch": 0.8718100890207715, + "grad_norm": 0.5497450828552246, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7607512474060059, + "num_tokens": 704493944.0, + "step": 1469 + }, + { + "epoch": 0.8724035608308606, + "grad_norm": 
0.5395509004592896, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7511622905731201, + "num_tokens": 705001200.0, + "step": 1470 + }, + { + "epoch": 0.8729970326409495, + "grad_norm": 0.5300896763801575, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7618743777275085, + "num_tokens": 705489138.0, + "step": 1471 + }, + { + "epoch": 0.8735905044510386, + "grad_norm": 0.5316885113716125, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.758003294467926, + "num_tokens": 706000546.0, + "step": 1472 + }, + { + "epoch": 0.8741839762611276, + "grad_norm": 0.5116707682609558, + "learning_rate": 1e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7530294060707092, + "num_tokens": 706539002.0, + "step": 1473 + }, + { + "epoch": 0.8747774480712166, + "grad_norm": 0.5805963277816772, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7502538561820984, + "num_tokens": 706965818.0, + "step": 1474 + }, + { + "epoch": 0.8753709198813057, + "grad_norm": 0.5558772683143616, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7596176862716675, + "num_tokens": 707432631.0, + "step": 1475 + }, + { + "epoch": 0.8759643916913946, + "grad_norm": 0.5629997253417969, + "learning_rate": 1e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.758453369140625, + "num_tokens": 707888328.0, + "step": 1476 + }, + { + "epoch": 0.8765578635014837, + "grad_norm": 0.54859459400177, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7753168344497681, + "num_tokens": 708358026.0, + "step": 1477 + }, + { + "epoch": 0.8771513353115727, + "grad_norm": 0.6058205366134644, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7489645481109619, + "num_tokens": 708828245.0, + "step": 1478 + }, + { + "epoch": 0.8777448071216617, + "grad_norm": 0.5676120519638062, + "learning_rate": 1e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.7595248818397522, + "num_tokens": 
709285119.0, + "step": 1479 + }, + { + "epoch": 0.8783382789317508, + "grad_norm": 0.5324345231056213, + "learning_rate": 1e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7487689256668091, + "num_tokens": 709808664.0, + "step": 1480 + }, + { + "epoch": 0.8789317507418397, + "grad_norm": 0.6674978733062744, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7500385046005249, + "num_tokens": 710228109.0, + "step": 1481 + }, + { + "epoch": 0.8795252225519288, + "grad_norm": 0.5950703620910645, + "learning_rate": 1e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7611380219459534, + "num_tokens": 710670804.0, + "step": 1482 + }, + { + "epoch": 0.8801186943620178, + "grad_norm": 0.5673949122428894, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7457025647163391, + "num_tokens": 711147582.0, + "step": 1483 + }, + { + "epoch": 0.8807121661721068, + "grad_norm": 0.5540648698806763, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7517971992492676, + "num_tokens": 711613849.0, + "step": 1484 + }, + { + "epoch": 0.8813056379821959, + "grad_norm": 0.5333675742149353, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7460842728614807, + "num_tokens": 712094540.0, + "step": 1485 + }, + { + "epoch": 0.8818991097922849, + "grad_norm": 0.5335438251495361, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7725037336349487, + "num_tokens": 712528158.0, + "step": 1486 + }, + { + "epoch": 0.8824925816023739, + "grad_norm": 0.5836489200592041, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7506896257400513, + "num_tokens": 712981549.0, + "step": 1487 + }, + { + "epoch": 0.8830860534124629, + "grad_norm": 0.5402339100837708, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7593767642974854, + "num_tokens": 713471257.0, + "step": 1488 + }, + { + "epoch": 0.8836795252225519, + "grad_norm": 0.5020433664321899, + "learning_rate": 1e-06, 
+ "loss": 0.7825, + "mean_token_accuracy": 0.756501317024231, + "num_tokens": 713970881.0, + "step": 1489 + }, + { + "epoch": 0.884272997032641, + "grad_norm": 0.5250263214111328, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7444160580635071, + "num_tokens": 714475578.0, + "step": 1490 + }, + { + "epoch": 0.88486646884273, + "grad_norm": 0.5692324638366699, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7620409727096558, + "num_tokens": 714945792.0, + "step": 1491 + }, + { + "epoch": 0.885459940652819, + "grad_norm": 0.5438718199729919, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7490589618682861, + "num_tokens": 715426671.0, + "step": 1492 + }, + { + "epoch": 0.886053412462908, + "grad_norm": 0.5798665285110474, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7678227424621582, + "num_tokens": 715900575.0, + "step": 1493 + }, + { + "epoch": 0.886646884272997, + "grad_norm": 0.5745545029640198, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7645676136016846, + "num_tokens": 716322482.0, + "step": 1494 + }, + { + "epoch": 0.887240356083086, + "grad_norm": 0.5693415403366089, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7684246301651001, + "num_tokens": 716795378.0, + "step": 1495 + }, + { + "epoch": 0.8878338278931751, + "grad_norm": 0.5327877402305603, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7638565897941589, + "num_tokens": 717290918.0, + "step": 1496 + }, + { + "epoch": 0.8884272997032641, + "grad_norm": 0.5680555701255798, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7629086971282959, + "num_tokens": 717738507.0, + "step": 1497 + }, + { + "epoch": 0.8890207715133531, + "grad_norm": 0.6458418965339661, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7564756870269775, + "num_tokens": 718173972.0, + "step": 1498 + }, + { + "epoch": 
0.8896142433234422, + "grad_norm": 0.533676028251648, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7652195692062378, + "num_tokens": 718674279.0, + "step": 1499 + }, + { + "epoch": 0.8902077151335311, + "grad_norm": 0.5334179997444153, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7570838332176208, + "num_tokens": 719172931.0, + "step": 1500 + }, + { + "epoch": 0.8908011869436202, + "grad_norm": 0.5331872701644897, + "learning_rate": 1e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7535638809204102, + "num_tokens": 719654233.0, + "step": 1501 + }, + { + "epoch": 0.8913946587537092, + "grad_norm": 0.5358446836471558, + "learning_rate": 1e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7647688388824463, + "num_tokens": 720130681.0, + "step": 1502 + }, + { + "epoch": 0.8919881305637982, + "grad_norm": 0.5587506890296936, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7481672167778015, + "num_tokens": 720584918.0, + "step": 1503 + }, + { + "epoch": 0.8925816023738873, + "grad_norm": 0.5345503091812134, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7546734809875488, + "num_tokens": 721060396.0, + "step": 1504 + }, + { + "epoch": 0.8931750741839762, + "grad_norm": 0.5607869625091553, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7725856304168701, + "num_tokens": 721521776.0, + "step": 1505 + }, + { + "epoch": 0.8937685459940653, + "grad_norm": 0.5592580437660217, + "learning_rate": 1e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7515071034431458, + "num_tokens": 722027461.0, + "step": 1506 + }, + { + "epoch": 0.8943620178041543, + "grad_norm": 0.5319221019744873, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.773848831653595, + "num_tokens": 722498153.0, + "step": 1507 + }, + { + "epoch": 0.8949554896142433, + "grad_norm": 0.5459402799606323, + "learning_rate": 1e-06, + "loss": 0.7646, + "mean_token_accuracy": 
0.7585784196853638, + "num_tokens": 723018168.0, + "step": 1508 + }, + { + "epoch": 0.8955489614243324, + "grad_norm": 0.5282476544380188, + "learning_rate": 1e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.77488112449646, + "num_tokens": 723500928.0, + "step": 1509 + }, + { + "epoch": 0.8961424332344213, + "grad_norm": 0.5637136697769165, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7620590329170227, + "num_tokens": 723930604.0, + "step": 1510 + }, + { + "epoch": 0.8967359050445104, + "grad_norm": 0.5124623775482178, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7604491114616394, + "num_tokens": 724454645.0, + "step": 1511 + }, + { + "epoch": 0.8973293768545995, + "grad_norm": 0.5381774306297302, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7574223875999451, + "num_tokens": 724966805.0, + "step": 1512 + }, + { + "epoch": 0.8979228486646884, + "grad_norm": 0.540720522403717, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7586890459060669, + "num_tokens": 725439875.0, + "step": 1513 + }, + { + "epoch": 0.8985163204747775, + "grad_norm": 0.5417417883872986, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7491832971572876, + "num_tokens": 725923591.0, + "step": 1514 + }, + { + "epoch": 0.8991097922848664, + "grad_norm": 0.5633677840232849, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7665624618530273, + "num_tokens": 726363154.0, + "step": 1515 + }, + { + "epoch": 0.8997032640949555, + "grad_norm": 0.5696761608123779, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7582894563674927, + "num_tokens": 726809409.0, + "step": 1516 + }, + { + "epoch": 0.9002967359050446, + "grad_norm": 0.5313928723335266, + "learning_rate": 1e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7741528749465942, + "num_tokens": 727318550.0, + "step": 1517 + }, + { + "epoch": 0.9008902077151335, + "grad_norm": 
0.5381218791007996, + "learning_rate": 1e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7568835616111755, + "num_tokens": 727842710.0, + "step": 1518 + }, + { + "epoch": 0.9014836795252226, + "grad_norm": 0.543701171875, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7414971590042114, + "num_tokens": 728314030.0, + "step": 1519 + }, + { + "epoch": 0.9020771513353115, + "grad_norm": 0.5437642335891724, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7490438222885132, + "num_tokens": 728791004.0, + "step": 1520 + }, + { + "epoch": 0.9026706231454006, + "grad_norm": 0.5366199016571045, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7631645202636719, + "num_tokens": 729329346.0, + "step": 1521 + }, + { + "epoch": 0.9032640949554896, + "grad_norm": 0.5601711273193359, + "learning_rate": 1e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7551261186599731, + "num_tokens": 729816739.0, + "step": 1522 + }, + { + "epoch": 0.9038575667655786, + "grad_norm": 0.5727477669715881, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7621539235115051, + "num_tokens": 730327711.0, + "step": 1523 + }, + { + "epoch": 0.9044510385756677, + "grad_norm": 0.5295612215995789, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7504773139953613, + "num_tokens": 730823442.0, + "step": 1524 + }, + { + "epoch": 0.9050445103857567, + "grad_norm": 0.5539984107017517, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7532392740249634, + "num_tokens": 731291138.0, + "step": 1525 + }, + { + "epoch": 0.9056379821958457, + "grad_norm": 0.5599523186683655, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7607249021530151, + "num_tokens": 731760499.0, + "step": 1526 + }, + { + "epoch": 0.9062314540059347, + "grad_norm": 0.5511245131492615, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7509504556655884, + "num_tokens": 
732224558.0, + "step": 1527 + }, + { + "epoch": 0.9068249258160237, + "grad_norm": 0.5507809519767761, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7653043866157532, + "num_tokens": 732696583.0, + "step": 1528 + }, + { + "epoch": 0.9074183976261128, + "grad_norm": 0.5316858887672424, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7653656601905823, + "num_tokens": 733174293.0, + "step": 1529 + }, + { + "epoch": 0.9080118694362018, + "grad_norm": 0.5878714919090271, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7376521825790405, + "num_tokens": 733646555.0, + "step": 1530 + }, + { + "epoch": 0.9086053412462908, + "grad_norm": 0.5771299600601196, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.763876736164093, + "num_tokens": 734077896.0, + "step": 1531 + }, + { + "epoch": 0.9091988130563798, + "grad_norm": 0.6117068529129028, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7576620578765869, + "num_tokens": 734530800.0, + "step": 1532 + }, + { + "epoch": 0.9097922848664688, + "grad_norm": 0.5251708030700684, + "learning_rate": 1e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7775542736053467, + "num_tokens": 735048557.0, + "step": 1533 + }, + { + "epoch": 0.9103857566765579, + "grad_norm": 0.5521098971366882, + "learning_rate": 1e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7609585523605347, + "num_tokens": 735533977.0, + "step": 1534 + }, + { + "epoch": 0.9109792284866469, + "grad_norm": 0.5594976544380188, + "learning_rate": 1e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7594693303108215, + "num_tokens": 735963663.0, + "step": 1535 + }, + { + "epoch": 0.9115727002967359, + "grad_norm": 0.5555779337882996, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7615519165992737, + "num_tokens": 736406074.0, + "step": 1536 + }, + { + "epoch": 0.9121661721068249, + "grad_norm": 0.5496185421943665, + "learning_rate": 1e-06, 
+ "loss": 0.7795, + "mean_token_accuracy": 0.7555907964706421, + "num_tokens": 736863846.0, + "step": 1537 + }, + { + "epoch": 0.912759643916914, + "grad_norm": 0.5894923806190491, + "learning_rate": 1e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7552375793457031, + "num_tokens": 737319595.0, + "step": 1538 + }, + { + "epoch": 0.913353115727003, + "grad_norm": 0.5814457535743713, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7633154392242432, + "num_tokens": 737783733.0, + "step": 1539 + }, + { + "epoch": 0.913946587537092, + "grad_norm": 0.5328832268714905, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7746254205703735, + "num_tokens": 738294698.0, + "step": 1540 + }, + { + "epoch": 0.914540059347181, + "grad_norm": 0.5648860931396484, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7605579495429993, + "num_tokens": 738737325.0, + "step": 1541 + }, + { + "epoch": 0.91513353115727, + "grad_norm": 0.591955304145813, + "learning_rate": 1e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7570184469223022, + "num_tokens": 739194220.0, + "step": 1542 + }, + { + "epoch": 0.9157270029673591, + "grad_norm": 0.5516431331634521, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7566937208175659, + "num_tokens": 739650358.0, + "step": 1543 + }, + { + "epoch": 0.916320474777448, + "grad_norm": 0.5208805203437805, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7640206813812256, + "num_tokens": 740153293.0, + "step": 1544 + }, + { + "epoch": 0.9169139465875371, + "grad_norm": 0.5434961915016174, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7666985392570496, + "num_tokens": 740625252.0, + "step": 1545 + }, + { + "epoch": 0.9175074183976261, + "grad_norm": 0.529486358165741, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7518279552459717, + "num_tokens": 741121349.0, + "step": 1546 + }, + { + "epoch": 
0.9181008902077151, + "grad_norm": 0.5589212775230408, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7644977569580078, + "num_tokens": 741593701.0, + "step": 1547 + }, + { + "epoch": 0.9186943620178042, + "grad_norm": 0.5767490863800049, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7517935037612915, + "num_tokens": 742039759.0, + "step": 1548 + }, + { + "epoch": 0.9192878338278931, + "grad_norm": 0.5632396936416626, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7575948238372803, + "num_tokens": 742526739.0, + "step": 1549 + }, + { + "epoch": 0.9198813056379822, + "grad_norm": 0.4983837604522705, + "learning_rate": 1e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7596002817153931, + "num_tokens": 743054608.0, + "step": 1550 + }, + { + "epoch": 0.9204747774480713, + "grad_norm": 0.5103604197502136, + "learning_rate": 1e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7613903284072876, + "num_tokens": 743578734.0, + "step": 1551 + }, + { + "epoch": 0.9210682492581602, + "grad_norm": 0.5449531674385071, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7702322006225586, + "num_tokens": 744036194.0, + "step": 1552 + }, + { + "epoch": 0.9216617210682493, + "grad_norm": 0.5209944248199463, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7590065002441406, + "num_tokens": 744519704.0, + "step": 1553 + }, + { + "epoch": 0.9222551928783382, + "grad_norm": 0.525082528591156, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.758059024810791, + "num_tokens": 745010526.0, + "step": 1554 + }, + { + "epoch": 0.9228486646884273, + "grad_norm": 0.520557701587677, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7650214433670044, + "num_tokens": 745520867.0, + "step": 1555 + }, + { + "epoch": 0.9234421364985164, + "grad_norm": 0.5336942672729492, + "learning_rate": 1e-06, + "loss": 0.7303, + "mean_token_accuracy": 
0.7700423002243042, + "num_tokens": 745981921.0, + "step": 1556 + }, + { + "epoch": 0.9240356083086053, + "grad_norm": 0.5187477469444275, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.754151463508606, + "num_tokens": 746480371.0, + "step": 1557 + }, + { + "epoch": 0.9246290801186944, + "grad_norm": 0.5756285190582275, + "learning_rate": 1e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7551992535591125, + "num_tokens": 746939786.0, + "step": 1558 + }, + { + "epoch": 0.9252225519287833, + "grad_norm": 0.5560184121131897, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7493891716003418, + "num_tokens": 747389319.0, + "step": 1559 + }, + { + "epoch": 0.9258160237388724, + "grad_norm": 0.5552143454551697, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7480379939079285, + "num_tokens": 747884944.0, + "step": 1560 + }, + { + "epoch": 0.9264094955489615, + "grad_norm": 0.5004568696022034, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7645925283432007, + "num_tokens": 748449917.0, + "step": 1561 + }, + { + "epoch": 0.9270029673590504, + "grad_norm": 0.5316395163536072, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7739925384521484, + "num_tokens": 748927318.0, + "step": 1562 + }, + { + "epoch": 0.9275964391691395, + "grad_norm": 0.5186316967010498, + "learning_rate": 1e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.775190532207489, + "num_tokens": 749416906.0, + "step": 1563 + }, + { + "epoch": 0.9281899109792285, + "grad_norm": 0.4997091293334961, + "learning_rate": 1e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7618128061294556, + "num_tokens": 749948834.0, + "step": 1564 + }, + { + "epoch": 0.9287833827893175, + "grad_norm": 0.5435897707939148, + "learning_rate": 1e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7521113157272339, + "num_tokens": 750470357.0, + "step": 1565 + }, + { + "epoch": 0.9293768545994066, + "grad_norm": 
0.521462082862854, + "learning_rate": 1e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7671923637390137, + "num_tokens": 750980958.0, + "step": 1566 + }, + { + "epoch": 0.9299703264094955, + "grad_norm": 0.5285443067550659, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7742195725440979, + "num_tokens": 751465192.0, + "step": 1567 + }, + { + "epoch": 0.9305637982195846, + "grad_norm": 0.500751793384552, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7652918100357056, + "num_tokens": 751967928.0, + "step": 1568 + }, + { + "epoch": 0.9311572700296736, + "grad_norm": 0.5783628225326538, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7628962993621826, + "num_tokens": 752450478.0, + "step": 1569 + }, + { + "epoch": 0.9317507418397626, + "grad_norm": 0.5443576574325562, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7663370966911316, + "num_tokens": 752901720.0, + "step": 1570 + }, + { + "epoch": 0.9323442136498516, + "grad_norm": 0.5053706169128418, + "learning_rate": 1e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7699345350265503, + "num_tokens": 753404491.0, + "step": 1571 + }, + { + "epoch": 0.9329376854599406, + "grad_norm": 0.509876012802124, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7594992518424988, + "num_tokens": 753900181.0, + "step": 1572 + }, + { + "epoch": 0.9335311572700297, + "grad_norm": 0.5266136527061462, + "learning_rate": 1e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7751781940460205, + "num_tokens": 754380188.0, + "step": 1573 + }, + { + "epoch": 0.9341246290801187, + "grad_norm": 0.5328490138053894, + "learning_rate": 1e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7725884914398193, + "num_tokens": 754864627.0, + "step": 1574 + }, + { + "epoch": 0.9347181008902077, + "grad_norm": 0.5264351963996887, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7526501417160034, + "num_tokens": 
755373036.0, + "step": 1575 + }, + { + "epoch": 0.9353115727002967, + "grad_norm": 0.5393479466438293, + "learning_rate": 1e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.759579062461853, + "num_tokens": 755840242.0, + "step": 1576 + }, + { + "epoch": 0.9359050445103858, + "grad_norm": 0.5350327491760254, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7574886679649353, + "num_tokens": 756278813.0, + "step": 1577 + }, + { + "epoch": 0.9364985163204748, + "grad_norm": 0.560974657535553, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7484046816825867, + "num_tokens": 756739663.0, + "step": 1578 + }, + { + "epoch": 0.9370919881305638, + "grad_norm": 0.5430423021316528, + "learning_rate": 1e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7701612114906311, + "num_tokens": 757232461.0, + "step": 1579 + }, + { + "epoch": 0.9376854599406528, + "grad_norm": 0.5283769369125366, + "learning_rate": 1e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7556412220001221, + "num_tokens": 757708656.0, + "step": 1580 + }, + { + "epoch": 0.9382789317507418, + "grad_norm": 0.5464017987251282, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7595952749252319, + "num_tokens": 758189891.0, + "step": 1581 + }, + { + "epoch": 0.9388724035608309, + "grad_norm": 0.5600082278251648, + "learning_rate": 1e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7654635310173035, + "num_tokens": 758650447.0, + "step": 1582 + }, + { + "epoch": 0.9394658753709199, + "grad_norm": 0.5603934526443481, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7368178367614746, + "num_tokens": 759100862.0, + "step": 1583 + }, + { + "epoch": 0.9400593471810089, + "grad_norm": 0.5326659083366394, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7512797117233276, + "num_tokens": 759590101.0, + "step": 1584 + }, + { + "epoch": 0.9406528189910979, + "grad_norm": 0.5895691514015198, + "learning_rate": 1e-06, + 
"loss": 0.7666, + "mean_token_accuracy": 0.7600436210632324, + "num_tokens": 760018968.0, + "step": 1585 + }, + { + "epoch": 0.9412462908011869, + "grad_norm": 0.5497466325759888, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7696753740310669, + "num_tokens": 760494698.0, + "step": 1586 + }, + { + "epoch": 0.941839762611276, + "grad_norm": 0.542377769947052, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.756959855556488, + "num_tokens": 760955524.0, + "step": 1587 + }, + { + "epoch": 0.942433234421365, + "grad_norm": 0.5336489081382751, + "learning_rate": 1e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7561060190200806, + "num_tokens": 761449206.0, + "step": 1588 + }, + { + "epoch": 0.943026706231454, + "grad_norm": 0.5629222989082336, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7570140361785889, + "num_tokens": 761930431.0, + "step": 1589 + }, + { + "epoch": 0.9436201780415431, + "grad_norm": 0.5306991934776306, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7707113027572632, + "num_tokens": 762389876.0, + "step": 1590 + }, + { + "epoch": 0.944213649851632, + "grad_norm": 0.5672734379768372, + "learning_rate": 1e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7549717426300049, + "num_tokens": 762861159.0, + "step": 1591 + }, + { + "epoch": 0.9448071216617211, + "grad_norm": 0.5175654888153076, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7507491707801819, + "num_tokens": 763377361.0, + "step": 1592 + }, + { + "epoch": 0.94540059347181, + "grad_norm": 0.5447734594345093, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7510138750076294, + "num_tokens": 763833101.0, + "step": 1593 + }, + { + "epoch": 0.9459940652818991, + "grad_norm": 0.5326209664344788, + "learning_rate": 1e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7539682388305664, + "num_tokens": 764313831.0, + "step": 1594 + }, + { + "epoch": 
0.9465875370919882, + "grad_norm": 0.5429404377937317, + "learning_rate": 1e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.745787501335144, + "num_tokens": 764823225.0, + "step": 1595 + }, + { + "epoch": 0.9471810089020771, + "grad_norm": 0.5995506644248962, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7619467377662659, + "num_tokens": 765250306.0, + "step": 1596 + }, + { + "epoch": 0.9477744807121662, + "grad_norm": 0.5840301513671875, + "learning_rate": 1e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7558294534683228, + "num_tokens": 765666182.0, + "step": 1597 + }, + { + "epoch": 0.9483679525222551, + "grad_norm": 0.5138015747070312, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7598245143890381, + "num_tokens": 766175637.0, + "step": 1598 + }, + { + "epoch": 0.9489614243323442, + "grad_norm": 0.5314165353775024, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7592166662216187, + "num_tokens": 766659357.0, + "step": 1599 + }, + { + "epoch": 0.9495548961424333, + "grad_norm": 0.5866967439651489, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7520401477813721, + "num_tokens": 767058047.0, + "step": 1600 + }, + { + "epoch": 0.9501483679525222, + "grad_norm": 0.5723278522491455, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7655285596847534, + "num_tokens": 767501233.0, + "step": 1601 + }, + { + "epoch": 0.9507418397626113, + "grad_norm": 0.529045045375824, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7627585530281067, + "num_tokens": 768008976.0, + "step": 1602 + }, + { + "epoch": 0.9513353115727003, + "grad_norm": 0.5467821955680847, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.748756468296051, + "num_tokens": 768484906.0, + "step": 1603 + }, + { + "epoch": 0.9519287833827893, + "grad_norm": 0.5410948395729065, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 
0.7678030729293823, + "num_tokens": 768948572.0, + "step": 1604 + }, + { + "epoch": 0.9525222551928784, + "grad_norm": 0.5518943071365356, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7635501623153687, + "num_tokens": 769421663.0, + "step": 1605 + }, + { + "epoch": 0.9531157270029673, + "grad_norm": 0.5570685863494873, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7394133806228638, + "num_tokens": 769848095.0, + "step": 1606 + }, + { + "epoch": 0.9537091988130564, + "grad_norm": 0.5355701446533203, + "learning_rate": 1e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7624578475952148, + "num_tokens": 770349011.0, + "step": 1607 + }, + { + "epoch": 0.9543026706231454, + "grad_norm": 0.5410164594650269, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7575297951698303, + "num_tokens": 770813608.0, + "step": 1608 + }, + { + "epoch": 0.9548961424332344, + "grad_norm": 0.5422949194908142, + "learning_rate": 1e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7602862119674683, + "num_tokens": 771275371.0, + "step": 1609 + }, + { + "epoch": 0.9554896142433235, + "grad_norm": 0.5302476286888123, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.767841100692749, + "num_tokens": 771783584.0, + "step": 1610 + }, + { + "epoch": 0.9560830860534124, + "grad_norm": 0.5133563280105591, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7634943127632141, + "num_tokens": 772261901.0, + "step": 1611 + }, + { + "epoch": 0.9566765578635015, + "grad_norm": 0.5216814279556274, + "learning_rate": 1e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7598930597305298, + "num_tokens": 772746648.0, + "step": 1612 + }, + { + "epoch": 0.9572700296735905, + "grad_norm": 0.5243682861328125, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7687243223190308, + "num_tokens": 773263647.0, + "step": 1613 + }, + { + "epoch": 0.9578635014836795, + "grad_norm": 
0.5307537317276001, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7646093964576721, + "num_tokens": 773763091.0, + "step": 1614 + }, + { + "epoch": 0.9584569732937686, + "grad_norm": 0.5467987060546875, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7569007873535156, + "num_tokens": 774224103.0, + "step": 1615 + }, + { + "epoch": 0.9590504451038576, + "grad_norm": 0.5024445652961731, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7620811462402344, + "num_tokens": 774732849.0, + "step": 1616 + }, + { + "epoch": 0.9596439169139466, + "grad_norm": 0.5327184200286865, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7733784914016724, + "num_tokens": 775192165.0, + "step": 1617 + }, + { + "epoch": 0.9602373887240356, + "grad_norm": 0.5166491270065308, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7625466585159302, + "num_tokens": 775692905.0, + "step": 1618 + }, + { + "epoch": 0.9608308605341246, + "grad_norm": 0.5743899345397949, + "learning_rate": 1e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7615706324577332, + "num_tokens": 776132302.0, + "step": 1619 + }, + { + "epoch": 0.9614243323442137, + "grad_norm": 0.5521530508995056, + "learning_rate": 1e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7549507021903992, + "num_tokens": 776566037.0, + "step": 1620 + }, + { + "epoch": 0.9620178041543027, + "grad_norm": 0.5288813710212708, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7553654909133911, + "num_tokens": 777056968.0, + "step": 1621 + }, + { + "epoch": 0.9626112759643917, + "grad_norm": 0.5510229468345642, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7549718022346497, + "num_tokens": 777514876.0, + "step": 1622 + }, + { + "epoch": 0.9632047477744807, + "grad_norm": 0.5519340634346008, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7630484104156494, + "num_tokens": 
777974713.0, + "step": 1623 + }, + { + "epoch": 0.9637982195845697, + "grad_norm": 0.5496743321418762, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7629575729370117, + "num_tokens": 778453488.0, + "step": 1624 + }, + { + "epoch": 0.9643916913946587, + "grad_norm": 0.5305376052856445, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7540121078491211, + "num_tokens": 778962927.0, + "step": 1625 + }, + { + "epoch": 0.9649851632047478, + "grad_norm": 0.5552239418029785, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7645347118377686, + "num_tokens": 779410598.0, + "step": 1626 + }, + { + "epoch": 0.9655786350148368, + "grad_norm": 0.5547143220901489, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7601134777069092, + "num_tokens": 779843966.0, + "step": 1627 + }, + { + "epoch": 0.9661721068249258, + "grad_norm": 0.5192576050758362, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7600194811820984, + "num_tokens": 780337208.0, + "step": 1628 + }, + { + "epoch": 0.9667655786350149, + "grad_norm": 0.53577721118927, + "learning_rate": 1e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7711758017539978, + "num_tokens": 780798596.0, + "step": 1629 + }, + { + "epoch": 0.9673590504451038, + "grad_norm": 0.578185498714447, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7793102860450745, + "num_tokens": 781247362.0, + "step": 1630 + }, + { + "epoch": 0.9679525222551929, + "grad_norm": 0.5253727436065674, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7622373104095459, + "num_tokens": 781747553.0, + "step": 1631 + }, + { + "epoch": 0.9685459940652819, + "grad_norm": 0.5186571478843689, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7695781588554382, + "num_tokens": 782206997.0, + "step": 1632 + }, + { + "epoch": 0.9691394658753709, + "grad_norm": 0.5514636635780334, + "learning_rate": 1e-06, + 
"loss": 0.7794, + "mean_token_accuracy": 0.7574137449264526, + "num_tokens": 782681567.0, + "step": 1633 + }, + { + "epoch": 0.96973293768546, + "grad_norm": 0.5432469248771667, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7585774064064026, + "num_tokens": 783173342.0, + "step": 1634 + }, + { + "epoch": 0.9703264094955489, + "grad_norm": 0.520592212677002, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.770404577255249, + "num_tokens": 783675313.0, + "step": 1635 + }, + { + "epoch": 0.970919881305638, + "grad_norm": 0.5242982506752014, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7708690762519836, + "num_tokens": 784180695.0, + "step": 1636 + }, + { + "epoch": 0.971513353115727, + "grad_norm": 0.5760998725891113, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7420740127563477, + "num_tokens": 784648855.0, + "step": 1637 + }, + { + "epoch": 0.972106824925816, + "grad_norm": 0.5406936407089233, + "learning_rate": 1e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.7647721171379089, + "num_tokens": 785127362.0, + "step": 1638 + }, + { + "epoch": 0.9727002967359051, + "grad_norm": 0.5267645120620728, + "learning_rate": 1e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7573807239532471, + "num_tokens": 785628221.0, + "step": 1639 + }, + { + "epoch": 0.973293768545994, + "grad_norm": 0.5376405119895935, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7660761475563049, + "num_tokens": 786073972.0, + "step": 1640 + }, + { + "epoch": 0.9738872403560831, + "grad_norm": 0.5317971110343933, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7590672969818115, + "num_tokens": 786555306.0, + "step": 1641 + }, + { + "epoch": 0.9744807121661722, + "grad_norm": 0.5445658564567566, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7581769227981567, + "num_tokens": 787037136.0, + "step": 1642 + }, + { + "epoch": 
0.9750741839762611, + "grad_norm": 0.5590937733650208, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7586623430252075, + "num_tokens": 787487979.0, + "step": 1643 + }, + { + "epoch": 0.9756676557863502, + "grad_norm": 0.515815258026123, + "learning_rate": 1e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7702763080596924, + "num_tokens": 787987170.0, + "step": 1644 + }, + { + "epoch": 0.9762611275964391, + "grad_norm": 0.5420960187911987, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.766739010810852, + "num_tokens": 788445972.0, + "step": 1645 + }, + { + "epoch": 0.9768545994065282, + "grad_norm": 0.548558235168457, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7541842460632324, + "num_tokens": 788933702.0, + "step": 1646 + }, + { + "epoch": 0.9774480712166173, + "grad_norm": 0.523094654083252, + "learning_rate": 1e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7549627423286438, + "num_tokens": 789479801.0, + "step": 1647 + }, + { + "epoch": 0.9780415430267062, + "grad_norm": 0.5749934315681458, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.753183901309967, + "num_tokens": 789960668.0, + "step": 1648 + }, + { + "epoch": 0.9786350148367953, + "grad_norm": 0.5554081797599792, + "learning_rate": 1e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7602640986442566, + "num_tokens": 790395895.0, + "step": 1649 + }, + { + "epoch": 0.9792284866468842, + "grad_norm": 0.6119580864906311, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7622846364974976, + "num_tokens": 790864638.0, + "step": 1650 + }, + { + "epoch": 0.9798219584569733, + "grad_norm": 0.5641713738441467, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.757200300693512, + "num_tokens": 791360839.0, + "step": 1651 + }, + { + "epoch": 0.9804154302670623, + "grad_norm": 0.5675839781761169, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 
0.7557008266448975, + "num_tokens": 791782171.0, + "step": 1652 + }, + { + "epoch": 0.9810089020771513, + "grad_norm": 0.538659930229187, + "learning_rate": 1e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7705981731414795, + "num_tokens": 792253333.0, + "step": 1653 + }, + { + "epoch": 0.9816023738872404, + "grad_norm": 0.5806061625480652, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7545512914657593, + "num_tokens": 792776441.0, + "step": 1654 + }, + { + "epoch": 0.9821958456973294, + "grad_norm": 0.529418408870697, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7589226961135864, + "num_tokens": 793277493.0, + "step": 1655 + }, + { + "epoch": 0.9827893175074184, + "grad_norm": 0.5577471256256104, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7560131549835205, + "num_tokens": 793766949.0, + "step": 1656 + }, + { + "epoch": 0.9833827893175074, + "grad_norm": 0.5239621996879578, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7569408416748047, + "num_tokens": 794263278.0, + "step": 1657 + }, + { + "epoch": 0.9839762611275964, + "grad_norm": 0.5466590523719788, + "learning_rate": 1e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7656702995300293, + "num_tokens": 794747078.0, + "step": 1658 + }, + { + "epoch": 0.9845697329376855, + "grad_norm": 0.5459341406822205, + "learning_rate": 1e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7627601623535156, + "num_tokens": 795196203.0, + "step": 1659 + }, + { + "epoch": 0.9851632047477745, + "grad_norm": 0.6604257822036743, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7564415335655212, + "num_tokens": 795698604.0, + "step": 1660 + }, + { + "epoch": 0.9857566765578635, + "grad_norm": 0.5422782897949219, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7659285068511963, + "num_tokens": 796199414.0, + "step": 1661 + }, + { + "epoch": 0.9863501483679525, + "grad_norm": 
0.5506287813186646, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7518981695175171, + "num_tokens": 796672795.0, + "step": 1662 + }, + { + "epoch": 0.9869436201780415, + "grad_norm": 0.5387349724769592, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7615590691566467, + "num_tokens": 797192029.0, + "step": 1663 + }, + { + "epoch": 0.9875370919881306, + "grad_norm": 0.5688586831092834, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7555402517318726, + "num_tokens": 797643043.0, + "step": 1664 + }, + { + "epoch": 0.9881305637982196, + "grad_norm": 0.5137168169021606, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7516326904296875, + "num_tokens": 798128876.0, + "step": 1665 + }, + { + "epoch": 0.9887240356083086, + "grad_norm": 0.5469667911529541, + "learning_rate": 1e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7529431581497192, + "num_tokens": 798589703.0, + "step": 1666 + }, + { + "epoch": 0.9893175074183976, + "grad_norm": 0.5894489884376526, + "learning_rate": 1e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.756861686706543, + "num_tokens": 799045574.0, + "step": 1667 + }, + { + "epoch": 0.9899109792284867, + "grad_norm": 0.5749348402023315, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7673517465591431, + "num_tokens": 799495241.0, + "step": 1668 + }, + { + "epoch": 0.9905044510385757, + "grad_norm": 0.5029959082603455, + "learning_rate": 1e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7526750564575195, + "num_tokens": 800018725.0, + "step": 1669 + }, + { + "epoch": 0.9910979228486647, + "grad_norm": 0.5725580453872681, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7742125391960144, + "num_tokens": 800500865.0, + "step": 1670 + }, + { + "epoch": 0.9916913946587537, + "grad_norm": 0.5429513454437256, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7679045796394348, + "num_tokens": 
800979705.0, + "step": 1671 + }, + { + "epoch": 0.9922848664688427, + "grad_norm": 0.5384284853935242, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7645429372787476, + "num_tokens": 801453100.0, + "step": 1672 + }, + { + "epoch": 0.9928783382789318, + "grad_norm": 0.5343916416168213, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7680396437644958, + "num_tokens": 801921234.0, + "step": 1673 + }, + { + "epoch": 0.9934718100890207, + "grad_norm": 0.5564929246902466, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7576865553855896, + "num_tokens": 802370453.0, + "step": 1674 + }, + { + "epoch": 0.9940652818991098, + "grad_norm": 0.5252459645271301, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7614521384239197, + "num_tokens": 802856918.0, + "step": 1675 + }, + { + "epoch": 0.9946587537091988, + "grad_norm": 0.5070993900299072, + "learning_rate": 1e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7765877842903137, + "num_tokens": 803378839.0, + "step": 1676 + }, + { + "epoch": 0.9952522255192878, + "grad_norm": 0.5278406143188477, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7525537610054016, + "num_tokens": 803874923.0, + "step": 1677 + }, + { + "epoch": 0.9958456973293769, + "grad_norm": 0.5719795227050781, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7442749738693237, + "num_tokens": 804377614.0, + "step": 1678 + }, + { + "epoch": 0.9964391691394658, + "grad_norm": 0.5343385934829712, + "learning_rate": 1e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.7788057327270508, + "num_tokens": 804842212.0, + "step": 1679 + }, + { + "epoch": 0.9970326409495549, + "grad_norm": 0.5668982863426208, + "learning_rate": 1e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.753895103931427, + "num_tokens": 805278111.0, + "step": 1680 + }, + { + "epoch": 0.997626112759644, + "grad_norm": 0.5509381890296936, + "learning_rate": 1e-06, + 
"loss": 0.7653, + "mean_token_accuracy": 0.7617774605751038, + "num_tokens": 805731547.0, + "step": 1681 + }, + { + "epoch": 0.9982195845697329, + "grad_norm": 0.5697535276412964, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7650307416915894, + "num_tokens": 806198323.0, + "step": 1682 + }, + { + "epoch": 0.998813056379822, + "grad_norm": 0.5295347571372986, + "learning_rate": 1e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7550756931304932, + "num_tokens": 806702349.0, + "step": 1683 + }, + { + "epoch": 0.9994065281899109, + "grad_norm": 0.5726540684700012, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7554073333740234, + "num_tokens": 807130593.0, + "step": 1684 + }, + { + "epoch": 1.0, + "grad_norm": 0.5224700570106506, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7503358125686646, + "num_tokens": 807646900.0, + "step": 1685 + }, + { + "epoch": 1.000593471810089, + "grad_norm": 0.5410070419311523, + "learning_rate": 1e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7535852193832397, + "num_tokens": 808161475.0, + "step": 1686 + }, + { + "epoch": 1.0011869436201781, + "grad_norm": 0.5106633305549622, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7682991027832031, + "num_tokens": 808726944.0, + "step": 1687 + }, + { + "epoch": 1.001780415430267, + "grad_norm": 0.5318490266799927, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7575491666793823, + "num_tokens": 809216714.0, + "step": 1688 + }, + { + "epoch": 1.002373887240356, + "grad_norm": 0.5650784969329834, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7468698024749756, + "num_tokens": 809675175.0, + "step": 1689 + }, + { + "epoch": 1.002967359050445, + "grad_norm": 0.5735985636711121, + "learning_rate": 1e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7653334736824036, + "num_tokens": 810133699.0, + "step": 1690 + }, + { + "epoch": 1.0035608308605342, + 
"grad_norm": 0.6023754477500916, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7586929798126221, + "num_tokens": 810632993.0, + "step": 1691 + }, + { + "epoch": 1.0041543026706232, + "grad_norm": 0.5677465200424194, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7689091563224792, + "num_tokens": 811095187.0, + "step": 1692 + }, + { + "epoch": 1.004747774480712, + "grad_norm": 0.5367078185081482, + "learning_rate": 1e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7663195133209229, + "num_tokens": 811572050.0, + "step": 1693 + }, + { + "epoch": 1.0053412462908011, + "grad_norm": 0.560916006565094, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7606637477874756, + "num_tokens": 812067787.0, + "step": 1694 + }, + { + "epoch": 1.0059347181008902, + "grad_norm": 0.533454954624176, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7525486350059509, + "num_tokens": 812542332.0, + "step": 1695 + }, + { + "epoch": 1.0065281899109793, + "grad_norm": 0.5698684453964233, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7365829944610596, + "num_tokens": 812980265.0, + "step": 1696 + }, + { + "epoch": 1.0071216617210683, + "grad_norm": 0.5599662065505981, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7470954656600952, + "num_tokens": 813446476.0, + "step": 1697 + }, + { + "epoch": 1.0077151335311574, + "grad_norm": 0.5772547125816345, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7493866682052612, + "num_tokens": 813912396.0, + "step": 1698 + }, + { + "epoch": 1.0083086053412462, + "grad_norm": 0.5493178963661194, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7682915925979614, + "num_tokens": 814371814.0, + "step": 1699 + }, + { + "epoch": 1.0089020771513353, + "grad_norm": 0.5739766955375671, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7751012444496155, + 
"num_tokens": 814855235.0, + "step": 1700 + }, + { + "epoch": 1.0094955489614243, + "grad_norm": 0.5504724383354187, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7635532021522522, + "num_tokens": 815323999.0, + "step": 1701 + }, + { + "epoch": 1.0100890207715134, + "grad_norm": 0.5673458576202393, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7588492631912231, + "num_tokens": 815765629.0, + "step": 1702 + }, + { + "epoch": 1.0106824925816025, + "grad_norm": 0.5325767397880554, + "learning_rate": 1e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7558234333992004, + "num_tokens": 816265873.0, + "step": 1703 + }, + { + "epoch": 1.0112759643916913, + "grad_norm": 0.5446155071258545, + "learning_rate": 1e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.7575135231018066, + "num_tokens": 816790780.0, + "step": 1704 + }, + { + "epoch": 1.0118694362017804, + "grad_norm": 0.5948062539100647, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7603206634521484, + "num_tokens": 817231933.0, + "step": 1705 + }, + { + "epoch": 1.0124629080118694, + "grad_norm": 0.5685214996337891, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7577702403068542, + "num_tokens": 817695884.0, + "step": 1706 + }, + { + "epoch": 1.0130563798219585, + "grad_norm": 0.5386358499526978, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7571471929550171, + "num_tokens": 818190508.0, + "step": 1707 + }, + { + "epoch": 1.0136498516320476, + "grad_norm": 0.5370752215385437, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7696506381034851, + "num_tokens": 818702351.0, + "step": 1708 + }, + { + "epoch": 1.0142433234421364, + "grad_norm": 0.5534093379974365, + "learning_rate": 1e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7619553804397583, + "num_tokens": 819167700.0, + "step": 1709 + }, + { + "epoch": 1.0148367952522255, + "grad_norm": 0.5515801906585693, + 
"learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7664186358451843, + "num_tokens": 819614535.0, + "step": 1710 + }, + { + "epoch": 1.0154302670623145, + "grad_norm": 0.5433504581451416, + "learning_rate": 1e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.777991771697998, + "num_tokens": 820084043.0, + "step": 1711 + }, + { + "epoch": 1.0160237388724036, + "grad_norm": 0.5296920537948608, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.756510853767395, + "num_tokens": 820584298.0, + "step": 1712 + }, + { + "epoch": 1.0166172106824927, + "grad_norm": 0.5783188939094543, + "learning_rate": 1e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7536395788192749, + "num_tokens": 821037932.0, + "step": 1713 + }, + { + "epoch": 1.0172106824925815, + "grad_norm": 0.5353294014930725, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7726044654846191, + "num_tokens": 821529971.0, + "step": 1714 + }, + { + "epoch": 1.0178041543026706, + "grad_norm": 0.5424361824989319, + "learning_rate": 1e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7788832187652588, + "num_tokens": 822008142.0, + "step": 1715 + }, + { + "epoch": 1.0183976261127596, + "grad_norm": 0.5638760924339294, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7610408067703247, + "num_tokens": 822449074.0, + "step": 1716 + }, + { + "epoch": 1.0189910979228487, + "grad_norm": 0.6004260182380676, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.758385181427002, + "num_tokens": 822941682.0, + "step": 1717 + }, + { + "epoch": 1.0195845697329378, + "grad_norm": 0.5536392331123352, + "learning_rate": 1e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7722198367118835, + "num_tokens": 823424162.0, + "step": 1718 + }, + { + "epoch": 1.0201780415430266, + "grad_norm": 0.5329455733299255, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7700947523117065, + "num_tokens": 823914120.0, + "step": 1719 
+ }, + { + "epoch": 1.0207715133531157, + "grad_norm": 0.6067412495613098, + "learning_rate": 1e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7718048095703125, + "num_tokens": 824354055.0, + "step": 1720 + }, + { + "epoch": 1.0213649851632047, + "grad_norm": 0.5675974488258362, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.760468602180481, + "num_tokens": 824831869.0, + "step": 1721 + }, + { + "epoch": 1.0219584569732938, + "grad_norm": 0.5604744553565979, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7520129680633545, + "num_tokens": 825303802.0, + "step": 1722 + }, + { + "epoch": 1.0225519287833829, + "grad_norm": 0.5231877565383911, + "learning_rate": 1e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7718753814697266, + "num_tokens": 825775819.0, + "step": 1723 + }, + { + "epoch": 1.0231454005934717, + "grad_norm": 0.6027208566665649, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7315695285797119, + "num_tokens": 826261660.0, + "step": 1724 + }, + { + "epoch": 1.0237388724035608, + "grad_norm": 0.5929421186447144, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7598832845687866, + "num_tokens": 826718392.0, + "step": 1725 + }, + { + "epoch": 1.0243323442136498, + "grad_norm": 0.5651364326477051, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7435789108276367, + "num_tokens": 827183163.0, + "step": 1726 + }, + { + "epoch": 1.024925816023739, + "grad_norm": 0.5771515965461731, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7671876549720764, + "num_tokens": 827649612.0, + "step": 1727 + }, + { + "epoch": 1.025519287833828, + "grad_norm": 0.5356395840644836, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7644445300102234, + "num_tokens": 828179891.0, + "step": 1728 + }, + { + "epoch": 1.026112759643917, + "grad_norm": 0.577251672744751, + "learning_rate": 1e-06, + "loss": 0.7283, + 
"mean_token_accuracy": 0.7696020603179932, + "num_tokens": 828668104.0, + "step": 1729 + }, + { + "epoch": 1.0267062314540059, + "grad_norm": 0.5860411524772644, + "learning_rate": 1e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7576738595962524, + "num_tokens": 829130032.0, + "step": 1730 + }, + { + "epoch": 1.027299703264095, + "grad_norm": 0.5633941888809204, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7538051605224609, + "num_tokens": 829605822.0, + "step": 1731 + }, + { + "epoch": 1.027893175074184, + "grad_norm": 0.581404983997345, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7554176449775696, + "num_tokens": 830087745.0, + "step": 1732 + }, + { + "epoch": 1.028486646884273, + "grad_norm": 0.5827056765556335, + "learning_rate": 1e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7786067724227905, + "num_tokens": 830564587.0, + "step": 1733 + }, + { + "epoch": 1.0290801186943621, + "grad_norm": 0.5998764634132385, + "learning_rate": 1e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7632697820663452, + "num_tokens": 830986519.0, + "step": 1734 + }, + { + "epoch": 1.029673590504451, + "grad_norm": 0.5444918870925903, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7485129833221436, + "num_tokens": 831483457.0, + "step": 1735 + }, + { + "epoch": 1.03026706231454, + "grad_norm": 0.5717772841453552, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.760710597038269, + "num_tokens": 831979491.0, + "step": 1736 + }, + { + "epoch": 1.030860534124629, + "grad_norm": 0.5480202436447144, + "learning_rate": 1e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7638977766036987, + "num_tokens": 832507358.0, + "step": 1737 + }, + { + "epoch": 1.0314540059347181, + "grad_norm": 0.5572938323020935, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7612788677215576, + "num_tokens": 832995261.0, + "step": 1738 + }, + { + "epoch": 1.0320474777448072, + 
"grad_norm": 0.549761950969696, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7630702257156372, + "num_tokens": 833438750.0, + "step": 1739 + }, + { + "epoch": 1.032640949554896, + "grad_norm": 0.6162472367286682, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7491747140884399, + "num_tokens": 833891209.0, + "step": 1740 + }, + { + "epoch": 1.0332344213649851, + "grad_norm": 0.5854506492614746, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7583498954772949, + "num_tokens": 834338574.0, + "step": 1741 + }, + { + "epoch": 1.0338278931750742, + "grad_norm": 0.5634685158729553, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.760195255279541, + "num_tokens": 834784536.0, + "step": 1742 + }, + { + "epoch": 1.0344213649851632, + "grad_norm": 0.583111047744751, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7647974491119385, + "num_tokens": 835241215.0, + "step": 1743 + }, + { + "epoch": 1.0350148367952523, + "grad_norm": 0.5408892631530762, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7580504417419434, + "num_tokens": 835763399.0, + "step": 1744 + }, + { + "epoch": 1.0356083086053411, + "grad_norm": 0.5857446193695068, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7589359879493713, + "num_tokens": 836191906.0, + "step": 1745 + }, + { + "epoch": 1.0362017804154302, + "grad_norm": 0.545958936214447, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7649438381195068, + "num_tokens": 836687762.0, + "step": 1746 + }, + { + "epoch": 1.0367952522255193, + "grad_norm": 0.5412518382072449, + "learning_rate": 1e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.7708466053009033, + "num_tokens": 837151547.0, + "step": 1747 + }, + { + "epoch": 1.0373887240356083, + "grad_norm": 0.5639196634292603, + "learning_rate": 1e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7698410153388977, + 
"num_tokens": 837642961.0, + "step": 1748 + }, + { + "epoch": 1.0379821958456974, + "grad_norm": 0.5433118939399719, + "learning_rate": 1e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7649128437042236, + "num_tokens": 838134121.0, + "step": 1749 + }, + { + "epoch": 1.0385756676557865, + "grad_norm": 0.5551385283470154, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.763329029083252, + "num_tokens": 838610296.0, + "step": 1750 + }, + { + "epoch": 1.0391691394658753, + "grad_norm": 0.5513039231300354, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7676193714141846, + "num_tokens": 839123821.0, + "step": 1751 + }, + { + "epoch": 1.0397626112759644, + "grad_norm": 0.533871591091156, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7589209079742432, + "num_tokens": 839603668.0, + "step": 1752 + }, + { + "epoch": 1.0403560830860534, + "grad_norm": 0.5542751550674438, + "learning_rate": 1e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7586754560470581, + "num_tokens": 840055213.0, + "step": 1753 + }, + { + "epoch": 1.0409495548961425, + "grad_norm": 0.5346413850784302, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7597770690917969, + "num_tokens": 840568327.0, + "step": 1754 + }, + { + "epoch": 1.0415430267062316, + "grad_norm": 0.5423884391784668, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7746118307113647, + "num_tokens": 841058290.0, + "step": 1755 + }, + { + "epoch": 1.0421364985163204, + "grad_norm": 0.5535617470741272, + "learning_rate": 1e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7755604982376099, + "num_tokens": 841523037.0, + "step": 1756 + }, + { + "epoch": 1.0427299703264095, + "grad_norm": 0.5719828605651855, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7552786469459534, + "num_tokens": 841950485.0, + "step": 1757 + }, + { + "epoch": 1.0433234421364985, + "grad_norm": 0.5228585004806519, + 
"learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7549527287483215, + "num_tokens": 842474824.0, + "step": 1758 + }, + { + "epoch": 1.0439169139465876, + "grad_norm": 0.5338042378425598, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7696489095687866, + "num_tokens": 842983335.0, + "step": 1759 + }, + { + "epoch": 1.0445103857566767, + "grad_norm": 0.5563154816627502, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7663095593452454, + "num_tokens": 843429416.0, + "step": 1760 + }, + { + "epoch": 1.0451038575667655, + "grad_norm": 0.5304633975028992, + "learning_rate": 1e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.75560462474823, + "num_tokens": 843908789.0, + "step": 1761 + }, + { + "epoch": 1.0456973293768546, + "grad_norm": 0.560330331325531, + "learning_rate": 1e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7690542936325073, + "num_tokens": 844418273.0, + "step": 1762 + }, + { + "epoch": 1.0462908011869436, + "grad_norm": 0.5465802550315857, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7681566476821899, + "num_tokens": 844855760.0, + "step": 1763 + }, + { + "epoch": 1.0468842729970327, + "grad_norm": 0.5691860318183899, + "learning_rate": 1e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.769991397857666, + "num_tokens": 845323890.0, + "step": 1764 + }, + { + "epoch": 1.0474777448071217, + "grad_norm": 0.5626463890075684, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7652218341827393, + "num_tokens": 845792808.0, + "step": 1765 + }, + { + "epoch": 1.0480712166172106, + "grad_norm": 0.5344595909118652, + "learning_rate": 1e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7587976455688477, + "num_tokens": 846290528.0, + "step": 1766 + }, + { + "epoch": 1.0486646884272997, + "grad_norm": 0.5517815351486206, + "learning_rate": 1e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7502868175506592, + "num_tokens": 846777543.0, + "step": 1767 + 
}, + { + "epoch": 1.0492581602373887, + "grad_norm": 0.5349388718605042, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7526179552078247, + "num_tokens": 847285046.0, + "step": 1768 + }, + { + "epoch": 1.0498516320474778, + "grad_norm": 0.5213963389396667, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7559618949890137, + "num_tokens": 847822074.0, + "step": 1769 + }, + { + "epoch": 1.0504451038575668, + "grad_norm": 0.5492247939109802, + "learning_rate": 1e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7750123739242554, + "num_tokens": 848288627.0, + "step": 1770 + }, + { + "epoch": 1.0510385756676557, + "grad_norm": 0.5565081834793091, + "learning_rate": 1e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7557438611984253, + "num_tokens": 848754680.0, + "step": 1771 + }, + { + "epoch": 1.0516320474777447, + "grad_norm": 0.5134739279747009, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7700925469398499, + "num_tokens": 849284050.0, + "step": 1772 + }, + { + "epoch": 1.0522255192878338, + "grad_norm": 0.5587220191955566, + "learning_rate": 1e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7652336359024048, + "num_tokens": 849714017.0, + "step": 1773 + }, + { + "epoch": 1.0528189910979229, + "grad_norm": 0.5602039098739624, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7613482475280762, + "num_tokens": 850218130.0, + "step": 1774 + }, + { + "epoch": 1.053412462908012, + "grad_norm": 0.5992721915245056, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7453522682189941, + "num_tokens": 850660058.0, + "step": 1775 + }, + { + "epoch": 1.0540059347181008, + "grad_norm": 0.5343345403671265, + "learning_rate": 1e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7719389200210571, + "num_tokens": 851134528.0, + "step": 1776 + }, + { + "epoch": 1.0545994065281898, + "grad_norm": 0.5583794116973877, + "learning_rate": 1e-06, + "loss": 0.7917, + 
"mean_token_accuracy": 0.7527583837509155, + "num_tokens": 851615983.0, + "step": 1777 + }, + { + "epoch": 1.055192878338279, + "grad_norm": 0.5510520339012146, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7636212110519409, + "num_tokens": 852109522.0, + "step": 1778 + }, + { + "epoch": 1.055786350148368, + "grad_norm": 0.5424116849899292, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7474569082260132, + "num_tokens": 852625163.0, + "step": 1779 + }, + { + "epoch": 1.056379821958457, + "grad_norm": 0.558005690574646, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7621662616729736, + "num_tokens": 853079940.0, + "step": 1780 + }, + { + "epoch": 1.056973293768546, + "grad_norm": 0.5571801662445068, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7719720005989075, + "num_tokens": 853519710.0, + "step": 1781 + }, + { + "epoch": 1.057566765578635, + "grad_norm": 0.557288646697998, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7658569812774658, + "num_tokens": 854014789.0, + "step": 1782 + }, + { + "epoch": 1.058160237388724, + "grad_norm": 0.5595734715461731, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7707737684249878, + "num_tokens": 854505003.0, + "step": 1783 + }, + { + "epoch": 1.058753709198813, + "grad_norm": 0.5607667565345764, + "learning_rate": 1e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7637548446655273, + "num_tokens": 854997334.0, + "step": 1784 + }, + { + "epoch": 1.0593471810089021, + "grad_norm": 0.5368344783782959, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7594162821769714, + "num_tokens": 855486398.0, + "step": 1785 + }, + { + "epoch": 1.0599406528189912, + "grad_norm": 0.5662140846252441, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.763141930103302, + "num_tokens": 855931675.0, + "step": 1786 + }, + { + "epoch": 1.06053412462908, + 
"grad_norm": 0.5262380838394165, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7601275444030762, + "num_tokens": 856421795.0, + "step": 1787 + }, + { + "epoch": 1.061127596439169, + "grad_norm": 0.5285840630531311, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7709861397743225, + "num_tokens": 856893106.0, + "step": 1788 + }, + { + "epoch": 1.0617210682492582, + "grad_norm": 0.5473811626434326, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7622998952865601, + "num_tokens": 857399547.0, + "step": 1789 + }, + { + "epoch": 1.0623145400593472, + "grad_norm": 0.5301267504692078, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7688583135604858, + "num_tokens": 857875633.0, + "step": 1790 + }, + { + "epoch": 1.0629080118694363, + "grad_norm": 0.5592077970504761, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7649303674697876, + "num_tokens": 858356918.0, + "step": 1791 + }, + { + "epoch": 1.0635014836795251, + "grad_norm": 0.5825939178466797, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7601264119148254, + "num_tokens": 858801194.0, + "step": 1792 + }, + { + "epoch": 1.0640949554896142, + "grad_norm": 0.5979990363121033, + "learning_rate": 1e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7484276294708252, + "num_tokens": 859262431.0, + "step": 1793 + }, + { + "epoch": 1.0646884272997033, + "grad_norm": 0.52132248878479, + "learning_rate": 1e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7546331286430359, + "num_tokens": 859774440.0, + "step": 1794 + }, + { + "epoch": 1.0652818991097923, + "grad_norm": 0.5662804841995239, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7686650156974792, + "num_tokens": 860230148.0, + "step": 1795 + }, + { + "epoch": 1.0658753709198814, + "grad_norm": 0.5301393270492554, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7733670473098755, + 
"num_tokens": 860728839.0, + "step": 1796 + }, + { + "epoch": 1.0664688427299702, + "grad_norm": 0.5627714991569519, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7654511332511902, + "num_tokens": 861164759.0, + "step": 1797 + }, + { + "epoch": 1.0670623145400593, + "grad_norm": 0.5589221715927124, + "learning_rate": 1e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7544710636138916, + "num_tokens": 861616978.0, + "step": 1798 + }, + { + "epoch": 1.0676557863501484, + "grad_norm": 0.5611851215362549, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7667725682258606, + "num_tokens": 862107836.0, + "step": 1799 + }, + { + "epoch": 1.0682492581602374, + "grad_norm": 0.5506826639175415, + "learning_rate": 1e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7697584629058838, + "num_tokens": 862556875.0, + "step": 1800 + }, + { + "epoch": 1.0688427299703265, + "grad_norm": 0.5567286610603333, + "learning_rate": 1e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7634340524673462, + "num_tokens": 863056227.0, + "step": 1801 + }, + { + "epoch": 1.0694362017804155, + "grad_norm": 0.5305694937705994, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7661872506141663, + "num_tokens": 863519671.0, + "step": 1802 + }, + { + "epoch": 1.0700296735905044, + "grad_norm": 0.5600711107254028, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.768702507019043, + "num_tokens": 864008179.0, + "step": 1803 + }, + { + "epoch": 1.0706231454005934, + "grad_norm": 0.5577189922332764, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7520568370819092, + "num_tokens": 864478055.0, + "step": 1804 + }, + { + "epoch": 1.0712166172106825, + "grad_norm": 0.5384178161621094, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7658320665359497, + "num_tokens": 864959913.0, + "step": 1805 + }, + { + "epoch": 1.0718100890207716, + "grad_norm": 0.5728067755699158, + 
"learning_rate": 1e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7579439878463745, + "num_tokens": 865441561.0, + "step": 1806 + }, + { + "epoch": 1.0724035608308606, + "grad_norm": 0.5308356285095215, + "learning_rate": 1e-06, + "loss": 0.689, + "mean_token_accuracy": 0.7818008661270142, + "num_tokens": 865908873.0, + "step": 1807 + }, + { + "epoch": 1.0729970326409495, + "grad_norm": 0.5541953444480896, + "learning_rate": 1e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7651203870773315, + "num_tokens": 866380125.0, + "step": 1808 + }, + { + "epoch": 1.0735905044510385, + "grad_norm": 0.5298078656196594, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7676970958709717, + "num_tokens": 866874034.0, + "step": 1809 + }, + { + "epoch": 1.0741839762611276, + "grad_norm": 0.552553653717041, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.765235185623169, + "num_tokens": 867376952.0, + "step": 1810 + }, + { + "epoch": 1.0747774480712167, + "grad_norm": 0.5401339530944824, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7584007978439331, + "num_tokens": 867866999.0, + "step": 1811 + }, + { + "epoch": 1.0753709198813057, + "grad_norm": 0.5228416919708252, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7627830505371094, + "num_tokens": 868368703.0, + "step": 1812 + }, + { + "epoch": 1.0759643916913946, + "grad_norm": 0.5216745734214783, + "learning_rate": 1e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7774759531021118, + "num_tokens": 868880751.0, + "step": 1813 + }, + { + "epoch": 1.0765578635014836, + "grad_norm": 0.5345560908317566, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7648777365684509, + "num_tokens": 869398334.0, + "step": 1814 + }, + { + "epoch": 1.0771513353115727, + "grad_norm": 0.5433124303817749, + "learning_rate": 1e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7676191329956055, + "num_tokens": 869925652.0, + "step": 1815 + 
}, + { + "epoch": 1.0777448071216618, + "grad_norm": 0.519880473613739, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7572107315063477, + "num_tokens": 870441451.0, + "step": 1816 + }, + { + "epoch": 1.0783382789317508, + "grad_norm": 0.5507408976554871, + "learning_rate": 1e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7659656405448914, + "num_tokens": 870919482.0, + "step": 1817 + }, + { + "epoch": 1.0789317507418397, + "grad_norm": 0.562122642993927, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7711524963378906, + "num_tokens": 871377324.0, + "step": 1818 + }, + { + "epoch": 1.0795252225519287, + "grad_norm": 0.5481792092323303, + "learning_rate": 1e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.752031147480011, + "num_tokens": 871812564.0, + "step": 1819 + }, + { + "epoch": 1.0801186943620178, + "grad_norm": 0.5755323767662048, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7613013982772827, + "num_tokens": 872302293.0, + "step": 1820 + }, + { + "epoch": 1.0807121661721069, + "grad_norm": 0.5390602946281433, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7770755290985107, + "num_tokens": 872799048.0, + "step": 1821 + }, + { + "epoch": 1.081305637982196, + "grad_norm": 0.5729861855506897, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7630181908607483, + "num_tokens": 873250659.0, + "step": 1822 + }, + { + "epoch": 1.0818991097922848, + "grad_norm": 0.5795089602470398, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7672850489616394, + "num_tokens": 873741242.0, + "step": 1823 + }, + { + "epoch": 1.0824925816023738, + "grad_norm": 0.6260937452316284, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7570722103118896, + "num_tokens": 874219842.0, + "step": 1824 + }, + { + "epoch": 1.083086053412463, + "grad_norm": 0.5242007970809937, + "learning_rate": 1e-06, + "loss": 0.722, + 
"mean_token_accuracy": 0.7715294361114502, + "num_tokens": 874709749.0, + "step": 1825 + }, + { + "epoch": 1.083679525222552, + "grad_norm": 0.5199064016342163, + "learning_rate": 1e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7746241092681885, + "num_tokens": 875185008.0, + "step": 1826 + }, + { + "epoch": 1.084272997032641, + "grad_norm": 0.5699592232704163, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7694309949874878, + "num_tokens": 875662810.0, + "step": 1827 + }, + { + "epoch": 1.0848664688427299, + "grad_norm": 0.5954397916793823, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7611771821975708, + "num_tokens": 876162594.0, + "step": 1828 + }, + { + "epoch": 1.085459940652819, + "grad_norm": 0.544957160949707, + "learning_rate": 1e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7549154758453369, + "num_tokens": 876643677.0, + "step": 1829 + }, + { + "epoch": 1.086053412462908, + "grad_norm": 0.5712368488311768, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7580294609069824, + "num_tokens": 877126430.0, + "step": 1830 + }, + { + "epoch": 1.086646884272997, + "grad_norm": 0.5734604597091675, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7632468342781067, + "num_tokens": 877639666.0, + "step": 1831 + }, + { + "epoch": 1.0872403560830861, + "grad_norm": 0.5926910042762756, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7463303804397583, + "num_tokens": 878067784.0, + "step": 1832 + }, + { + "epoch": 1.0878338278931752, + "grad_norm": 0.5888997912406921, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7636121511459351, + "num_tokens": 878524127.0, + "step": 1833 + }, + { + "epoch": 1.088427299703264, + "grad_norm": 0.5540792346000671, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7634375095367432, + "num_tokens": 878996686.0, + "step": 1834 + }, + { + "epoch": 1.089020771513353, + 
"grad_norm": 0.5620121955871582, + "learning_rate": 1e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7550848722457886, + "num_tokens": 879440837.0, + "step": 1835 + }, + { + "epoch": 1.0896142433234421, + "grad_norm": 0.5870166420936584, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7704387903213501, + "num_tokens": 879932743.0, + "step": 1836 + }, + { + "epoch": 1.0902077151335312, + "grad_norm": 0.5648131966590881, + "learning_rate": 1e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7617811560630798, + "num_tokens": 880401953.0, + "step": 1837 + }, + { + "epoch": 1.0908011869436203, + "grad_norm": 0.5804789662361145, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7677266001701355, + "num_tokens": 880914152.0, + "step": 1838 + }, + { + "epoch": 1.0913946587537091, + "grad_norm": 0.5122698545455933, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7625046372413635, + "num_tokens": 881425034.0, + "step": 1839 + }, + { + "epoch": 1.0919881305637982, + "grad_norm": 0.5136278867721558, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7647697329521179, + "num_tokens": 881921161.0, + "step": 1840 + }, + { + "epoch": 1.0925816023738872, + "grad_norm": 0.5566068887710571, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7698475122451782, + "num_tokens": 882426226.0, + "step": 1841 + }, + { + "epoch": 1.0931750741839763, + "grad_norm": 0.6016142964363098, + "learning_rate": 1e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7804985046386719, + "num_tokens": 882878795.0, + "step": 1842 + }, + { + "epoch": 1.0937685459940654, + "grad_norm": 0.5149971842765808, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7535876035690308, + "num_tokens": 883415543.0, + "step": 1843 + }, + { + "epoch": 1.0943620178041542, + "grad_norm": 0.5603625178337097, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7694790363311768, + 
"num_tokens": 883880998.0, + "step": 1844 + }, + { + "epoch": 1.0949554896142433, + "grad_norm": 0.5497565269470215, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7688652276992798, + "num_tokens": 884341468.0, + "step": 1845 + }, + { + "epoch": 1.0955489614243323, + "grad_norm": 0.588202178478241, + "learning_rate": 1e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7687346935272217, + "num_tokens": 884764132.0, + "step": 1846 + }, + { + "epoch": 1.0961424332344214, + "grad_norm": 0.5761446952819824, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7594444751739502, + "num_tokens": 885229124.0, + "step": 1847 + }, + { + "epoch": 1.0967359050445105, + "grad_norm": 0.5647251009941101, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7757024765014648, + "num_tokens": 885693991.0, + "step": 1848 + }, + { + "epoch": 1.0973293768545993, + "grad_norm": 0.5362780094146729, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7690554857254028, + "num_tokens": 886186440.0, + "step": 1849 + }, + { + "epoch": 1.0979228486646884, + "grad_norm": 0.5622255802154541, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7611985206604004, + "num_tokens": 886665886.0, + "step": 1850 + }, + { + "epoch": 1.0985163204747774, + "grad_norm": 0.5421087741851807, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.750572144985199, + "num_tokens": 887151561.0, + "step": 1851 + }, + { + "epoch": 1.0991097922848665, + "grad_norm": 0.5485116839408875, + "learning_rate": 1e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7547742128372192, + "num_tokens": 887629879.0, + "step": 1852 + }, + { + "epoch": 1.0997032640949556, + "grad_norm": 0.5182912349700928, + "learning_rate": 1e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.779700756072998, + "num_tokens": 888144845.0, + "step": 1853 + }, + { + "epoch": 1.1002967359050446, + "grad_norm": 0.5549550652503967, + 
"learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7684076428413391, + "num_tokens": 888585096.0, + "step": 1854 + }, + { + "epoch": 1.1008902077151335, + "grad_norm": 0.5618695616722107, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7612997889518738, + "num_tokens": 889067356.0, + "step": 1855 + }, + { + "epoch": 1.1014836795252225, + "grad_norm": 0.535207211971283, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7738476991653442, + "num_tokens": 889558292.0, + "step": 1856 + }, + { + "epoch": 1.1020771513353116, + "grad_norm": 0.5581971406936646, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7639034986495972, + "num_tokens": 890076109.0, + "step": 1857 + }, + { + "epoch": 1.1026706231454007, + "grad_norm": 0.5330191850662231, + "learning_rate": 1e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.763910174369812, + "num_tokens": 890545099.0, + "step": 1858 + }, + { + "epoch": 1.1032640949554897, + "grad_norm": 0.5527345538139343, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7633578777313232, + "num_tokens": 891059782.0, + "step": 1859 + }, + { + "epoch": 1.1038575667655786, + "grad_norm": 0.5417131185531616, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7650479674339294, + "num_tokens": 891529499.0, + "step": 1860 + }, + { + "epoch": 1.1044510385756676, + "grad_norm": 0.5511386394500732, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7632503509521484, + "num_tokens": 891989328.0, + "step": 1861 + }, + { + "epoch": 1.1050445103857567, + "grad_norm": 0.573154091835022, + "learning_rate": 1e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7706832885742188, + "num_tokens": 892452385.0, + "step": 1862 + }, + { + "epoch": 1.1056379821958457, + "grad_norm": 0.5283448696136475, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7708775401115417, + "num_tokens": 892955176.0, + "step": 1863 + 
}, + { + "epoch": 1.1062314540059348, + "grad_norm": 0.5498929023742676, + "learning_rate": 1e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7509324550628662, + "num_tokens": 893437450.0, + "step": 1864 + }, + { + "epoch": 1.1068249258160237, + "grad_norm": 0.5479654669761658, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7609373331069946, + "num_tokens": 893915903.0, + "step": 1865 + }, + { + "epoch": 1.1074183976261127, + "grad_norm": 0.5400394797325134, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7602500319480896, + "num_tokens": 894379154.0, + "step": 1866 + }, + { + "epoch": 1.1080118694362018, + "grad_norm": 0.5331193208694458, + "learning_rate": 1e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7591657638549805, + "num_tokens": 894908169.0, + "step": 1867 + }, + { + "epoch": 1.1086053412462908, + "grad_norm": 0.5484079718589783, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7595980167388916, + "num_tokens": 895403094.0, + "step": 1868 + }, + { + "epoch": 1.10919881305638, + "grad_norm": 0.5248486399650574, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7736476063728333, + "num_tokens": 895898418.0, + "step": 1869 + }, + { + "epoch": 1.1097922848664687, + "grad_norm": 0.5374894738197327, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7628072500228882, + "num_tokens": 896376486.0, + "step": 1870 + }, + { + "epoch": 1.1103857566765578, + "grad_norm": 0.5594019889831543, + "learning_rate": 1e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7613146305084229, + "num_tokens": 896817243.0, + "step": 1871 + }, + { + "epoch": 1.1109792284866469, + "grad_norm": 0.5534048080444336, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7719892859458923, + "num_tokens": 897320451.0, + "step": 1872 + }, + { + "epoch": 1.111572700296736, + "grad_norm": 0.5354109406471252, + "learning_rate": 1e-06, + "loss": 0.7373, + 
"mean_token_accuracy": 0.7673559188842773, + "num_tokens": 897846469.0, + "step": 1873 + }, + { + "epoch": 1.112166172106825, + "grad_norm": 0.539889931678772, + "learning_rate": 1e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7746641635894775, + "num_tokens": 898308364.0, + "step": 1874 + }, + { + "epoch": 1.1127596439169138, + "grad_norm": 0.5384880900382996, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.766632616519928, + "num_tokens": 898794038.0, + "step": 1875 + }, + { + "epoch": 1.113353115727003, + "grad_norm": 0.5409849882125854, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7701452970504761, + "num_tokens": 899281610.0, + "step": 1876 + }, + { + "epoch": 1.113946587537092, + "grad_norm": 0.530055820941925, + "learning_rate": 1e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7793126106262207, + "num_tokens": 899780326.0, + "step": 1877 + }, + { + "epoch": 1.114540059347181, + "grad_norm": 0.6097345352172852, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7495412826538086, + "num_tokens": 900209133.0, + "step": 1878 + }, + { + "epoch": 1.11513353115727, + "grad_norm": 0.5691764950752258, + "learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7686443328857422, + "num_tokens": 900672097.0, + "step": 1879 + }, + { + "epoch": 1.115727002967359, + "grad_norm": 0.5161207318305969, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7669107913970947, + "num_tokens": 901197930.0, + "step": 1880 + }, + { + "epoch": 1.116320474777448, + "grad_norm": 0.5909409523010254, + "learning_rate": 1e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7612706422805786, + "num_tokens": 901625410.0, + "step": 1881 + }, + { + "epoch": 1.116913946587537, + "grad_norm": 0.5346240997314453, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7694755792617798, + "num_tokens": 902118502.0, + "step": 1882 + }, + { + "epoch": 1.1175074183976261, + "grad_norm": 
0.5285117030143738, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7727853059768677, + "num_tokens": 902610259.0, + "step": 1883 + }, + { + "epoch": 1.1181008902077152, + "grad_norm": 0.5292278528213501, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.766105592250824, + "num_tokens": 903097758.0, + "step": 1884 + }, + { + "epoch": 1.1186943620178043, + "grad_norm": 0.5254279971122742, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7586658000946045, + "num_tokens": 903597334.0, + "step": 1885 + }, + { + "epoch": 1.119287833827893, + "grad_norm": 0.5241813063621521, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7679488658905029, + "num_tokens": 904085466.0, + "step": 1886 + }, + { + "epoch": 1.1198813056379822, + "grad_norm": 0.5388187766075134, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7696784734725952, + "num_tokens": 904584162.0, + "step": 1887 + }, + { + "epoch": 1.1204747774480712, + "grad_norm": 0.5493360161781311, + "learning_rate": 1e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7698071002960205, + "num_tokens": 905053082.0, + "step": 1888 + }, + { + "epoch": 1.1210682492581603, + "grad_norm": 0.55689537525177, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7621377110481262, + "num_tokens": 905515921.0, + "step": 1889 + }, + { + "epoch": 1.1216617210682494, + "grad_norm": 0.5482414364814758, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7655447721481323, + "num_tokens": 905952378.0, + "step": 1890 + }, + { + "epoch": 1.1222551928783382, + "grad_norm": 0.5623723268508911, + "learning_rate": 1e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7772297263145447, + "num_tokens": 906397689.0, + "step": 1891 + }, + { + "epoch": 1.1228486646884273, + "grad_norm": 0.5197610855102539, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7728326320648193, + "num_tokens": 
906884799.0, + "step": 1892 + }, + { + "epoch": 1.1234421364985163, + "grad_norm": 0.5092893242835999, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7653821706771851, + "num_tokens": 907389745.0, + "step": 1893 + }, + { + "epoch": 1.1240356083086054, + "grad_norm": 0.5616641640663147, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7569501996040344, + "num_tokens": 907821894.0, + "step": 1894 + }, + { + "epoch": 1.1246290801186944, + "grad_norm": 0.5063769221305847, + "learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7658209800720215, + "num_tokens": 908321317.0, + "step": 1895 + }, + { + "epoch": 1.1252225519287833, + "grad_norm": 0.5076869130134583, + "learning_rate": 1e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7664852738380432, + "num_tokens": 908832612.0, + "step": 1896 + }, + { + "epoch": 1.1258160237388724, + "grad_norm": 0.5193812251091003, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.756423830986023, + "num_tokens": 909329021.0, + "step": 1897 + }, + { + "epoch": 1.1264094955489614, + "grad_norm": 0.5427153706550598, + "learning_rate": 1e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7546312212944031, + "num_tokens": 909815881.0, + "step": 1898 + }, + { + "epoch": 1.1270029673590505, + "grad_norm": 0.5754308104515076, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7632409930229187, + "num_tokens": 910269564.0, + "step": 1899 + }, + { + "epoch": 1.1275964391691395, + "grad_norm": 0.5484165549278259, + "learning_rate": 1e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7507681846618652, + "num_tokens": 910726246.0, + "step": 1900 + }, + { + "epoch": 1.1281899109792284, + "grad_norm": 0.49977409839630127, + "learning_rate": 1e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7768839001655579, + "num_tokens": 911257256.0, + "step": 1901 + }, + { + "epoch": 1.1287833827893174, + "grad_norm": 0.521925687789917, + "learning_rate": 1e-06, 
+ "loss": 0.7739, + "mean_token_accuracy": 0.7580584287643433, + "num_tokens": 911778255.0, + "step": 1902 + }, + { + "epoch": 1.1293768545994065, + "grad_norm": 0.5375158786773682, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7709599733352661, + "num_tokens": 912256176.0, + "step": 1903 + }, + { + "epoch": 1.1299703264094956, + "grad_norm": 0.5087738633155823, + "learning_rate": 1e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7792016267776489, + "num_tokens": 912763011.0, + "step": 1904 + }, + { + "epoch": 1.1305637982195846, + "grad_norm": 0.5698457956314087, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7669462561607361, + "num_tokens": 913184854.0, + "step": 1905 + }, + { + "epoch": 1.1311572700296737, + "grad_norm": 0.5178504586219788, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7731993794441223, + "num_tokens": 913699309.0, + "step": 1906 + }, + { + "epoch": 1.1317507418397625, + "grad_norm": 0.5408462285995483, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7692564725875854, + "num_tokens": 914175523.0, + "step": 1907 + }, + { + "epoch": 1.1323442136498516, + "grad_norm": 0.5776436924934387, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7620463967323303, + "num_tokens": 914689624.0, + "step": 1908 + }, + { + "epoch": 1.1329376854599407, + "grad_norm": 0.5658581852912903, + "learning_rate": 1e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.754776120185852, + "num_tokens": 915177815.0, + "step": 1909 + }, + { + "epoch": 1.1335311572700297, + "grad_norm": 0.542474627494812, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7674376964569092, + "num_tokens": 915670918.0, + "step": 1910 + }, + { + "epoch": 1.1341246290801186, + "grad_norm": 0.5837159752845764, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7750636339187622, + "num_tokens": 916156200.0, + "step": 1911 + }, + { + "epoch": 
1.1347181008902076, + "grad_norm": 0.5790337324142456, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7680914998054504, + "num_tokens": 916646396.0, + "step": 1912 + }, + { + "epoch": 1.1353115727002967, + "grad_norm": 0.5483394265174866, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7694269418716431, + "num_tokens": 917165614.0, + "step": 1913 + }, + { + "epoch": 1.1359050445103858, + "grad_norm": 0.5630859732627869, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7683928608894348, + "num_tokens": 917679512.0, + "step": 1914 + }, + { + "epoch": 1.1364985163204748, + "grad_norm": 0.5350094437599182, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.764485239982605, + "num_tokens": 918188779.0, + "step": 1915 + }, + { + "epoch": 1.137091988130564, + "grad_norm": 0.5510642528533936, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7579762935638428, + "num_tokens": 918702437.0, + "step": 1916 + }, + { + "epoch": 1.1376854599406527, + "grad_norm": 0.5718628168106079, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.765468955039978, + "num_tokens": 919184402.0, + "step": 1917 + }, + { + "epoch": 1.1382789317507418, + "grad_norm": 0.5733482241630554, + "learning_rate": 1e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7797262668609619, + "num_tokens": 919613551.0, + "step": 1918 + }, + { + "epoch": 1.1388724035608309, + "grad_norm": 0.5658503174781799, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7588855028152466, + "num_tokens": 920075356.0, + "step": 1919 + }, + { + "epoch": 1.13946587537092, + "grad_norm": 0.5966159105300903, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7629101276397705, + "num_tokens": 920576443.0, + "step": 1920 + }, + { + "epoch": 1.140059347181009, + "grad_norm": 0.5692886114120483, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 
0.7489324808120728, + "num_tokens": 921062401.0, + "step": 1921 + }, + { + "epoch": 1.1406528189910978, + "grad_norm": 0.5408810973167419, + "learning_rate": 1e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.7589109539985657, + "num_tokens": 921554501.0, + "step": 1922 + }, + { + "epoch": 1.141246290801187, + "grad_norm": 0.608993411064148, + "learning_rate": 1e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.7569684386253357, + "num_tokens": 921984754.0, + "step": 1923 + }, + { + "epoch": 1.141839762611276, + "grad_norm": 0.5600968599319458, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.771577000617981, + "num_tokens": 922486615.0, + "step": 1924 + }, + { + "epoch": 1.142433234421365, + "grad_norm": 0.5662102699279785, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7549254298210144, + "num_tokens": 922923272.0, + "step": 1925 + }, + { + "epoch": 1.143026706231454, + "grad_norm": 0.5445786118507385, + "learning_rate": 1e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7754770517349243, + "num_tokens": 923415844.0, + "step": 1926 + }, + { + "epoch": 1.1436201780415431, + "grad_norm": 0.5580638647079468, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7610098123550415, + "num_tokens": 923858023.0, + "step": 1927 + }, + { + "epoch": 1.144213649851632, + "grad_norm": 0.5245957374572754, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.761542558670044, + "num_tokens": 924376765.0, + "step": 1928 + }, + { + "epoch": 1.144807121661721, + "grad_norm": 0.5368139147758484, + "learning_rate": 1e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7740228176116943, + "num_tokens": 924829282.0, + "step": 1929 + }, + { + "epoch": 1.1454005934718101, + "grad_norm": 0.5503162741661072, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7575725317001343, + "num_tokens": 925283625.0, + "step": 1930 + }, + { + "epoch": 1.1459940652818992, + "grad_norm": 
0.5370294451713562, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7671447992324829, + "num_tokens": 925765179.0, + "step": 1931 + }, + { + "epoch": 1.146587537091988, + "grad_norm": 0.5047540068626404, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7608771324157715, + "num_tokens": 926323982.0, + "step": 1932 + }, + { + "epoch": 1.147181008902077, + "grad_norm": 0.5243087410926819, + "learning_rate": 1e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.755022406578064, + "num_tokens": 926808982.0, + "step": 1933 + }, + { + "epoch": 1.1477744807121661, + "grad_norm": 0.5230445265769958, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7630066275596619, + "num_tokens": 927272967.0, + "step": 1934 + }, + { + "epoch": 1.1483679525222552, + "grad_norm": 0.5683405995368958, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.764813244342804, + "num_tokens": 927743309.0, + "step": 1935 + }, + { + "epoch": 1.1489614243323443, + "grad_norm": 0.5323536396026611, + "learning_rate": 1e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7744089365005493, + "num_tokens": 928185991.0, + "step": 1936 + }, + { + "epoch": 1.1495548961424333, + "grad_norm": 0.5300054550170898, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7609185576438904, + "num_tokens": 928649640.0, + "step": 1937 + }, + { + "epoch": 1.1501483679525222, + "grad_norm": 0.5904362201690674, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7366514205932617, + "num_tokens": 929102371.0, + "step": 1938 + }, + { + "epoch": 1.1507418397626112, + "grad_norm": 0.5354209542274475, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.75873202085495, + "num_tokens": 929558528.0, + "step": 1939 + }, + { + "epoch": 1.1513353115727003, + "grad_norm": 0.5771279335021973, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7638997435569763, + "num_tokens": 
929988879.0, + "step": 1940 + }, + { + "epoch": 1.1519287833827894, + "grad_norm": 0.5575338006019592, + "learning_rate": 1e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7807775735855103, + "num_tokens": 930437174.0, + "step": 1941 + }, + { + "epoch": 1.1525222551928784, + "grad_norm": 0.5532331466674805, + "learning_rate": 1e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7632592916488647, + "num_tokens": 930893632.0, + "step": 1942 + }, + { + "epoch": 1.1531157270029673, + "grad_norm": 0.5276883244514465, + "learning_rate": 1e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7552541494369507, + "num_tokens": 931384104.0, + "step": 1943 + }, + { + "epoch": 1.1537091988130563, + "grad_norm": 0.5620826482772827, + "learning_rate": 1e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7757344245910645, + "num_tokens": 931860695.0, + "step": 1944 + }, + { + "epoch": 1.1543026706231454, + "grad_norm": 0.5516224503517151, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7689065337181091, + "num_tokens": 932332361.0, + "step": 1945 + }, + { + "epoch": 1.1548961424332345, + "grad_norm": 0.5251926183700562, + "learning_rate": 1e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.774112343788147, + "num_tokens": 932826390.0, + "step": 1946 + }, + { + "epoch": 1.1554896142433235, + "grad_norm": 0.5501478314399719, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7726168632507324, + "num_tokens": 933301227.0, + "step": 1947 + }, + { + "epoch": 1.1560830860534124, + "grad_norm": 0.5559704899787903, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7530713081359863, + "num_tokens": 933753049.0, + "step": 1948 + }, + { + "epoch": 1.1566765578635014, + "grad_norm": 0.5800051689147949, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7547506093978882, + "num_tokens": 934218094.0, + "step": 1949 + }, + { + "epoch": 1.1572700296735905, + "grad_norm": 0.5353035926818848, + "learning_rate": 1e-06, 
+ "loss": 0.7589, + "mean_token_accuracy": 0.7614477872848511, + "num_tokens": 934671396.0, + "step": 1950 + }, + { + "epoch": 1.1578635014836796, + "grad_norm": 0.5900796055793762, + "learning_rate": 1e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7599786520004272, + "num_tokens": 935073377.0, + "step": 1951 + }, + { + "epoch": 1.1584569732937686, + "grad_norm": 0.5633547306060791, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7665352821350098, + "num_tokens": 935514876.0, + "step": 1952 + }, + { + "epoch": 1.1590504451038575, + "grad_norm": 0.5357310771942139, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7579750418663025, + "num_tokens": 936001262.0, + "step": 1953 + }, + { + "epoch": 1.1596439169139465, + "grad_norm": 0.5584630370140076, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7644327282905579, + "num_tokens": 936458973.0, + "step": 1954 + }, + { + "epoch": 1.1602373887240356, + "grad_norm": 0.5439273715019226, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.770051121711731, + "num_tokens": 936935675.0, + "step": 1955 + }, + { + "epoch": 1.1608308605341247, + "grad_norm": 0.5359981060028076, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7651562690734863, + "num_tokens": 937461913.0, + "step": 1956 + }, + { + "epoch": 1.1614243323442137, + "grad_norm": 0.5432302355766296, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7720264196395874, + "num_tokens": 937914671.0, + "step": 1957 + }, + { + "epoch": 1.1620178041543028, + "grad_norm": 0.5252879858016968, + "learning_rate": 1e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.758897066116333, + "num_tokens": 938405215.0, + "step": 1958 + }, + { + "epoch": 1.1626112759643916, + "grad_norm": 0.5440086722373962, + "learning_rate": 1e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7570885419845581, + "num_tokens": 938913428.0, + "step": 1959 + }, + { + "epoch": 
1.1632047477744807, + "grad_norm": 0.5666568875312805, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7565068006515503, + "num_tokens": 939351341.0, + "step": 1960 + }, + { + "epoch": 1.1637982195845697, + "grad_norm": 0.5314676761627197, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7727549076080322, + "num_tokens": 939844266.0, + "step": 1961 + }, + { + "epoch": 1.1643916913946588, + "grad_norm": 0.5591577291488647, + "learning_rate": 1e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7652088403701782, + "num_tokens": 940311181.0, + "step": 1962 + }, + { + "epoch": 1.1649851632047477, + "grad_norm": 0.4949149489402771, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7703578472137451, + "num_tokens": 940854211.0, + "step": 1963 + }, + { + "epoch": 1.1655786350148367, + "grad_norm": 0.5362889170646667, + "learning_rate": 1e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7764187455177307, + "num_tokens": 941354576.0, + "step": 1964 + }, + { + "epoch": 1.1661721068249258, + "grad_norm": 0.5462698340415955, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7613123059272766, + "num_tokens": 941840209.0, + "step": 1965 + }, + { + "epoch": 1.1667655786350148, + "grad_norm": 0.5508174300193787, + "learning_rate": 1e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7650104761123657, + "num_tokens": 942315605.0, + "step": 1966 + }, + { + "epoch": 1.167359050445104, + "grad_norm": 0.530014157295227, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7677146792411804, + "num_tokens": 942816460.0, + "step": 1967 + }, + { + "epoch": 1.167952522255193, + "grad_norm": 0.5173858404159546, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7596737146377563, + "num_tokens": 943295900.0, + "step": 1968 + }, + { + "epoch": 1.1685459940652818, + "grad_norm": 0.5186158418655396, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 
0.7546815276145935, + "num_tokens": 943785991.0, + "step": 1969 + }, + { + "epoch": 1.1691394658753709, + "grad_norm": 0.511821448802948, + "learning_rate": 1e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7615547776222229, + "num_tokens": 944297423.0, + "step": 1970 + }, + { + "epoch": 1.16973293768546, + "grad_norm": 0.5337295532226562, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7680320739746094, + "num_tokens": 944761533.0, + "step": 1971 + }, + { + "epoch": 1.170326409495549, + "grad_norm": 0.5447278022766113, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7722554206848145, + "num_tokens": 945237468.0, + "step": 1972 + }, + { + "epoch": 1.170919881305638, + "grad_norm": 0.5542596578598022, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7694679498672485, + "num_tokens": 945710548.0, + "step": 1973 + }, + { + "epoch": 1.171513353115727, + "grad_norm": 0.547292172908783, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7759525179862976, + "num_tokens": 946176422.0, + "step": 1974 + }, + { + "epoch": 1.172106824925816, + "grad_norm": 0.5195297002792358, + "learning_rate": 1e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7730003595352173, + "num_tokens": 946658171.0, + "step": 1975 + }, + { + "epoch": 1.172700296735905, + "grad_norm": 0.5544541478157043, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7659225463867188, + "num_tokens": 947119472.0, + "step": 1976 + }, + { + "epoch": 1.173293768545994, + "grad_norm": 0.5326763391494751, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7446009516716003, + "num_tokens": 947623936.0, + "step": 1977 + }, + { + "epoch": 1.1738872403560832, + "grad_norm": 0.5408591032028198, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7574423551559448, + "num_tokens": 948147948.0, + "step": 1978 + }, + { + "epoch": 1.1744807121661722, + "grad_norm": 0.5327164530754089, 
+ "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7688615918159485, + "num_tokens": 948610124.0, + "step": 1979 + }, + { + "epoch": 1.175074183976261, + "grad_norm": 0.5459703803062439, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.764666736125946, + "num_tokens": 949075541.0, + "step": 1980 + }, + { + "epoch": 1.1756676557863501, + "grad_norm": 0.5603657364845276, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7588883638381958, + "num_tokens": 949537338.0, + "step": 1981 + }, + { + "epoch": 1.1762611275964392, + "grad_norm": 0.5176706314086914, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7591500282287598, + "num_tokens": 950034437.0, + "step": 1982 + }, + { + "epoch": 1.1768545994065283, + "grad_norm": 0.5201418995857239, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7604901790618896, + "num_tokens": 950530478.0, + "step": 1983 + }, + { + "epoch": 1.177448071216617, + "grad_norm": 0.5750729441642761, + "learning_rate": 1e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7619653344154358, + "num_tokens": 950972343.0, + "step": 1984 + }, + { + "epoch": 1.1780415430267062, + "grad_norm": 0.5340195894241333, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.757735013961792, + "num_tokens": 951491836.0, + "step": 1985 + }, + { + "epoch": 1.1786350148367952, + "grad_norm": 0.5681485533714294, + "learning_rate": 1e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7468559145927429, + "num_tokens": 951960614.0, + "step": 1986 + }, + { + "epoch": 1.1792284866468843, + "grad_norm": 0.5601391792297363, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7653756141662598, + "num_tokens": 952435243.0, + "step": 1987 + }, + { + "epoch": 1.1798219584569734, + "grad_norm": 0.5363548398017883, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7700982093811035, + "num_tokens": 952922237.0, + "step": 1988 
+ }, + { + "epoch": 1.1804154302670624, + "grad_norm": 0.5601548552513123, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7603722214698792, + "num_tokens": 953364053.0, + "step": 1989 + }, + { + "epoch": 1.1810089020771513, + "grad_norm": 0.5447849631309509, + "learning_rate": 1e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.7523285150527954, + "num_tokens": 953863359.0, + "step": 1990 + }, + { + "epoch": 1.1816023738872403, + "grad_norm": 0.5317331552505493, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7652119398117065, + "num_tokens": 954311264.0, + "step": 1991 + }, + { + "epoch": 1.1821958456973294, + "grad_norm": 0.557957112789154, + "learning_rate": 1e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7622448205947876, + "num_tokens": 954751650.0, + "step": 1992 + }, + { + "epoch": 1.1827893175074184, + "grad_norm": 0.5298891067504883, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7584595680236816, + "num_tokens": 955249128.0, + "step": 1993 + }, + { + "epoch": 1.1833827893175075, + "grad_norm": 0.548530638217926, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7620787620544434, + "num_tokens": 955731122.0, + "step": 1994 + }, + { + "epoch": 1.1839762611275964, + "grad_norm": 0.5217850804328918, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7619261145591736, + "num_tokens": 956242301.0, + "step": 1995 + }, + { + "epoch": 1.1845697329376854, + "grad_norm": 0.5939652919769287, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7488205432891846, + "num_tokens": 956696186.0, + "step": 1996 + }, + { + "epoch": 1.1851632047477745, + "grad_norm": 0.5214646458625793, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.76483154296875, + "num_tokens": 957208105.0, + "step": 1997 + }, + { + "epoch": 1.1857566765578635, + "grad_norm": 0.5534787774085999, + "learning_rate": 1e-06, + "loss": 0.7969, + 
"mean_token_accuracy": 0.7510899901390076, + "num_tokens": 957705173.0, + "step": 1998 + }, + { + "epoch": 1.1863501483679526, + "grad_norm": 0.5690791606903076, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.75975102186203, + "num_tokens": 958153190.0, + "step": 1999 + }, + { + "epoch": 1.1869436201780414, + "grad_norm": 0.5462986826896667, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7717565298080444, + "num_tokens": 958704922.0, + "step": 2000 + }, + { + "epoch": 1.1875370919881305, + "grad_norm": 0.5706959962844849, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7626659870147705, + "num_tokens": 959185734.0, + "step": 2001 + }, + { + "epoch": 1.1881305637982196, + "grad_norm": 0.5355578660964966, + "learning_rate": 1e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7774591445922852, + "num_tokens": 959750853.0, + "step": 2002 + }, + { + "epoch": 1.1887240356083086, + "grad_norm": 0.5452515482902527, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7634910941123962, + "num_tokens": 960260968.0, + "step": 2003 + }, + { + "epoch": 1.1893175074183977, + "grad_norm": 0.5402987003326416, + "learning_rate": 1e-06, + "loss": 0.6494, + "mean_token_accuracy": 0.7911096811294556, + "num_tokens": 960752081.0, + "step": 2004 + }, + { + "epoch": 1.1899109792284865, + "grad_norm": 0.5452539324760437, + "learning_rate": 1e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7708073854446411, + "num_tokens": 961241356.0, + "step": 2005 + }, + { + "epoch": 1.1905044510385756, + "grad_norm": 0.5619888305664062, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7706578373908997, + "num_tokens": 961702888.0, + "step": 2006 + }, + { + "epoch": 1.1910979228486647, + "grad_norm": 0.5928375124931335, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7625043392181396, + "num_tokens": 962152262.0, + "step": 2007 + }, + { + "epoch": 1.1916913946587537, + 
"grad_norm": 0.5802929401397705, + "learning_rate": 1e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7787374258041382, + "num_tokens": 962666898.0, + "step": 2008 + }, + { + "epoch": 1.1922848664688428, + "grad_norm": 0.5459344983100891, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7521219253540039, + "num_tokens": 963145057.0, + "step": 2009 + }, + { + "epoch": 1.1928783382789319, + "grad_norm": 0.555898129940033, + "learning_rate": 1e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.768612802028656, + "num_tokens": 963594140.0, + "step": 2010 + }, + { + "epoch": 1.1934718100890207, + "grad_norm": 0.5738224387168884, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7673864364624023, + "num_tokens": 964070990.0, + "step": 2011 + }, + { + "epoch": 1.1940652818991098, + "grad_norm": 0.6001997590065002, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7673776149749756, + "num_tokens": 964495736.0, + "step": 2012 + }, + { + "epoch": 1.1946587537091988, + "grad_norm": 0.564673662185669, + "learning_rate": 1e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.766791582107544, + "num_tokens": 964904834.0, + "step": 2013 + }, + { + "epoch": 1.195252225519288, + "grad_norm": 0.5473867654800415, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7652286291122437, + "num_tokens": 965389883.0, + "step": 2014 + }, + { + "epoch": 1.1958456973293767, + "grad_norm": 0.5241864323616028, + "learning_rate": 1e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.7801815271377563, + "num_tokens": 965916125.0, + "step": 2015 + }, + { + "epoch": 1.1964391691394658, + "grad_norm": 0.5231907367706299, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7645518779754639, + "num_tokens": 966405366.0, + "step": 2016 + }, + { + "epoch": 1.1970326409495549, + "grad_norm": 0.53465336561203, + "learning_rate": 1e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7684832811355591, + 
"num_tokens": 966898754.0, + "step": 2017 + }, + { + "epoch": 1.197626112759644, + "grad_norm": 0.5836277604103088, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7728395462036133, + "num_tokens": 967403362.0, + "step": 2018 + }, + { + "epoch": 1.198219584569733, + "grad_norm": 0.5491542816162109, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.754859447479248, + "num_tokens": 967904801.0, + "step": 2019 + }, + { + "epoch": 1.198813056379822, + "grad_norm": 0.5361668467521667, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7478002905845642, + "num_tokens": 968391557.0, + "step": 2020 + }, + { + "epoch": 1.199406528189911, + "grad_norm": 0.5866718292236328, + "learning_rate": 1e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7500394582748413, + "num_tokens": 968803556.0, + "step": 2021 + }, + { + "epoch": 1.2, + "grad_norm": 0.5930951237678528, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7686105370521545, + "num_tokens": 969240773.0, + "step": 2022 + }, + { + "epoch": 1.200593471810089, + "grad_norm": 0.5126124024391174, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7647396922111511, + "num_tokens": 969762046.0, + "step": 2023 + }, + { + "epoch": 1.201186943620178, + "grad_norm": 0.543292760848999, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7545582056045532, + "num_tokens": 970231075.0, + "step": 2024 + }, + { + "epoch": 1.2017804154302671, + "grad_norm": 0.5692458748817444, + "learning_rate": 1e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7701858282089233, + "num_tokens": 970737209.0, + "step": 2025 + }, + { + "epoch": 1.202373887240356, + "grad_norm": 0.5438128709793091, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7632119655609131, + "num_tokens": 971203386.0, + "step": 2026 + }, + { + "epoch": 1.202967359050445, + "grad_norm": 0.5378085374832153, + "learning_rate": 1e-06, + "loss": 
0.7519, + "mean_token_accuracy": 0.7628504037857056, + "num_tokens": 971734694.0, + "step": 2027 + }, + { + "epoch": 1.2035608308605341, + "grad_norm": 0.5588737726211548, + "learning_rate": 1e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7575427293777466, + "num_tokens": 972215362.0, + "step": 2028 + }, + { + "epoch": 1.2041543026706232, + "grad_norm": 0.5669310092926025, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7640529870986938, + "num_tokens": 972674598.0, + "step": 2029 + }, + { + "epoch": 1.2047477744807122, + "grad_norm": 0.5854511857032776, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7691376805305481, + "num_tokens": 973175387.0, + "step": 2030 + }, + { + "epoch": 1.2053412462908013, + "grad_norm": 0.5984475612640381, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7508634328842163, + "num_tokens": 973587029.0, + "step": 2031 + }, + { + "epoch": 1.2059347181008901, + "grad_norm": 0.5271645784378052, + "learning_rate": 1e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7522202730178833, + "num_tokens": 974100254.0, + "step": 2032 + }, + { + "epoch": 1.2065281899109792, + "grad_norm": 0.5923974514007568, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7550257444381714, + "num_tokens": 974626469.0, + "step": 2033 + }, + { + "epoch": 1.2071216617210683, + "grad_norm": 0.5687121748924255, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7698634266853333, + "num_tokens": 975081889.0, + "step": 2034 + }, + { + "epoch": 1.2077151335311573, + "grad_norm": 0.5419566631317139, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.772646427154541, + "num_tokens": 975583542.0, + "step": 2035 + }, + { + "epoch": 1.2083086053412462, + "grad_norm": 0.5661171078681946, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7744737863540649, + "num_tokens": 976015392.0, + "step": 2036 + }, + { + "epoch": 
1.2089020771513352, + "grad_norm": 0.5899601578712463, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7589015364646912, + "num_tokens": 976459432.0, + "step": 2037 + }, + { + "epoch": 1.2094955489614243, + "grad_norm": 0.5841950178146362, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7656078338623047, + "num_tokens": 976961974.0, + "step": 2038 + }, + { + "epoch": 1.2100890207715134, + "grad_norm": 0.5531997084617615, + "learning_rate": 1e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7619933485984802, + "num_tokens": 977435121.0, + "step": 2039 + }, + { + "epoch": 1.2106824925816024, + "grad_norm": 0.5861453413963318, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7572795152664185, + "num_tokens": 977867064.0, + "step": 2040 + }, + { + "epoch": 1.2112759643916915, + "grad_norm": 0.5474546551704407, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7545146942138672, + "num_tokens": 978398745.0, + "step": 2041 + }, + { + "epoch": 1.2118694362017803, + "grad_norm": 0.5863576531410217, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7642678022384644, + "num_tokens": 978843862.0, + "step": 2042 + }, + { + "epoch": 1.2124629080118694, + "grad_norm": 0.5599773526191711, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7676814794540405, + "num_tokens": 979321887.0, + "step": 2043 + }, + { + "epoch": 1.2130563798219585, + "grad_norm": 0.534605860710144, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7728476524353027, + "num_tokens": 979819270.0, + "step": 2044 + }, + { + "epoch": 1.2136498516320475, + "grad_norm": 0.5935091376304626, + "learning_rate": 1e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7521229982376099, + "num_tokens": 980257785.0, + "step": 2045 + }, + { + "epoch": 1.2142433234421366, + "grad_norm": 0.5438976883888245, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 
0.7787988781929016, + "num_tokens": 980707799.0, + "step": 2046 + }, + { + "epoch": 1.2148367952522254, + "grad_norm": 0.5503556728363037, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7585333585739136, + "num_tokens": 981210790.0, + "step": 2047 + }, + { + "epoch": 1.2154302670623145, + "grad_norm": 0.526187539100647, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7685278654098511, + "num_tokens": 981692506.0, + "step": 2048 + }, + { + "epoch": 1.2160237388724036, + "grad_norm": 0.5730617046356201, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7539865374565125, + "num_tokens": 982149506.0, + "step": 2049 + }, + { + "epoch": 1.2166172106824926, + "grad_norm": 0.5405805706977844, + "learning_rate": 1e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7760963439941406, + "num_tokens": 982646178.0, + "step": 2050 + }, + { + "epoch": 1.2172106824925817, + "grad_norm": 0.551308274269104, + "learning_rate": 1e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.76763916015625, + "num_tokens": 983107957.0, + "step": 2051 + }, + { + "epoch": 1.2178041543026705, + "grad_norm": 0.5382297039031982, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7473351955413818, + "num_tokens": 983610422.0, + "step": 2052 + }, + { + "epoch": 1.2183976261127596, + "grad_norm": 0.5875995755195618, + "learning_rate": 1e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.7521665096282959, + "num_tokens": 984027649.0, + "step": 2053 + }, + { + "epoch": 1.2189910979228487, + "grad_norm": 0.561892569065094, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7644180059432983, + "num_tokens": 984489256.0, + "step": 2054 + }, + { + "epoch": 1.2195845697329377, + "grad_norm": 0.5094396471977234, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7646458148956299, + "num_tokens": 985034111.0, + "step": 2055 + }, + { + "epoch": 1.2201780415430268, + "grad_norm": 
0.5553398132324219, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7695631980895996, + "num_tokens": 985493129.0, + "step": 2056 + }, + { + "epoch": 1.2207715133531156, + "grad_norm": 0.5412705540657043, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7634483575820923, + "num_tokens": 986025509.0, + "step": 2057 + }, + { + "epoch": 1.2213649851632047, + "grad_norm": 0.5702530741691589, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.772391676902771, + "num_tokens": 986457676.0, + "step": 2058 + }, + { + "epoch": 1.2219584569732937, + "grad_norm": 0.5881693959236145, + "learning_rate": 1e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7626179456710815, + "num_tokens": 986897814.0, + "step": 2059 + }, + { + "epoch": 1.2225519287833828, + "grad_norm": 0.5310471653938293, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7618730068206787, + "num_tokens": 987372319.0, + "step": 2060 + }, + { + "epoch": 1.2231454005934719, + "grad_norm": 0.547607421875, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7630798816680908, + "num_tokens": 987837238.0, + "step": 2061 + }, + { + "epoch": 1.223738872403561, + "grad_norm": 0.529169499874115, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7546572685241699, + "num_tokens": 988350683.0, + "step": 2062 + }, + { + "epoch": 1.2243323442136498, + "grad_norm": 0.5534629225730896, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7647873759269714, + "num_tokens": 988840357.0, + "step": 2063 + }, + { + "epoch": 1.2249258160237388, + "grad_norm": 0.5291078090667725, + "learning_rate": 1e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7663784027099609, + "num_tokens": 989332721.0, + "step": 2064 + }, + { + "epoch": 1.225519287833828, + "grad_norm": 0.5322517156600952, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7682271599769592, + "num_tokens": 
989857129.0, + "step": 2065 + }, + { + "epoch": 1.226112759643917, + "grad_norm": 0.5975133180618286, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7597773671150208, + "num_tokens": 990287575.0, + "step": 2066 + }, + { + "epoch": 1.2267062314540058, + "grad_norm": 0.5289385318756104, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7638410329818726, + "num_tokens": 990819095.0, + "step": 2067 + }, + { + "epoch": 1.2272997032640949, + "grad_norm": 0.5426493883132935, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7678419947624207, + "num_tokens": 991274704.0, + "step": 2068 + }, + { + "epoch": 1.227893175074184, + "grad_norm": 0.5616716146469116, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7638320922851562, + "num_tokens": 991729748.0, + "step": 2069 + }, + { + "epoch": 1.228486646884273, + "grad_norm": 0.5808659791946411, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7529548406600952, + "num_tokens": 992167129.0, + "step": 2070 + }, + { + "epoch": 1.229080118694362, + "grad_norm": 0.5399619340896606, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.768484354019165, + "num_tokens": 992651314.0, + "step": 2071 + }, + { + "epoch": 1.2296735905044511, + "grad_norm": 0.5461021661758423, + "learning_rate": 1e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7709572315216064, + "num_tokens": 993132930.0, + "step": 2072 + }, + { + "epoch": 1.23026706231454, + "grad_norm": 0.5424829125404358, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7597959637641907, + "num_tokens": 993634675.0, + "step": 2073 + }, + { + "epoch": 1.230860534124629, + "grad_norm": 0.5328977704048157, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7675884962081909, + "num_tokens": 994143864.0, + "step": 2074 + }, + { + "epoch": 1.231454005934718, + "grad_norm": 0.5828709006309509, + "learning_rate": 1e-06, + "loss": 
0.7791, + "mean_token_accuracy": 0.7561464905738831, + "num_tokens": 994623370.0, + "step": 2075 + }, + { + "epoch": 1.2320474777448072, + "grad_norm": 0.6001914143562317, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7604150772094727, + "num_tokens": 995060614.0, + "step": 2076 + }, + { + "epoch": 1.2326409495548962, + "grad_norm": 0.5668458342552185, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7773816585540771, + "num_tokens": 995524891.0, + "step": 2077 + }, + { + "epoch": 1.233234421364985, + "grad_norm": 0.5253818035125732, + "learning_rate": 1e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7837690711021423, + "num_tokens": 996056105.0, + "step": 2078 + }, + { + "epoch": 1.2338278931750741, + "grad_norm": 0.5448928475379944, + "learning_rate": 1e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7637147903442383, + "num_tokens": 996547852.0, + "step": 2079 + }, + { + "epoch": 1.2344213649851632, + "grad_norm": 0.5302894711494446, + "learning_rate": 1e-06, + "loss": 0.6394, + "mean_token_accuracy": 0.7964111566543579, + "num_tokens": 997047485.0, + "step": 2080 + }, + { + "epoch": 1.2350148367952523, + "grad_norm": 0.5684598088264465, + "learning_rate": 1e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.7824355959892273, + "num_tokens": 997529136.0, + "step": 2081 + }, + { + "epoch": 1.2356083086053413, + "grad_norm": 0.5375885367393494, + "learning_rate": 1e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7668733596801758, + "num_tokens": 998043454.0, + "step": 2082 + }, + { + "epoch": 1.2362017804154304, + "grad_norm": 0.5289742946624756, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7657349109649658, + "num_tokens": 998539084.0, + "step": 2083 + }, + { + "epoch": 1.2367952522255192, + "grad_norm": 0.5348660945892334, + "learning_rate": 1e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.7815845608711243, + "num_tokens": 999013705.0, + "step": 2084 + }, + { + "epoch": 
1.2373887240356083, + "grad_norm": 0.5187511444091797, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7753236293792725, + "num_tokens": 999513409.0, + "step": 2085 + }, + { + "epoch": 1.2379821958456974, + "grad_norm": 0.524324893951416, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7675381302833557, + "num_tokens": 999996620.0, + "step": 2086 + }, + { + "epoch": 1.2385756676557864, + "grad_norm": 0.5420796275138855, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7704500555992126, + "num_tokens": 1000496289.0, + "step": 2087 + }, + { + "epoch": 1.2391691394658753, + "grad_norm": 0.5079762935638428, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7664057016372681, + "num_tokens": 1001020839.0, + "step": 2088 + }, + { + "epoch": 1.2397626112759643, + "grad_norm": 0.5686794519424438, + "learning_rate": 1e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7748796939849854, + "num_tokens": 1001457244.0, + "step": 2089 + }, + { + "epoch": 1.2403560830860534, + "grad_norm": 0.6166486144065857, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7645804286003113, + "num_tokens": 1001943150.0, + "step": 2090 + }, + { + "epoch": 1.2409495548961424, + "grad_norm": 0.5462864637374878, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7550771236419678, + "num_tokens": 1002430956.0, + "step": 2091 + }, + { + "epoch": 1.2415430267062315, + "grad_norm": 0.5162477493286133, + "learning_rate": 1e-06, + "loss": 0.7368, + "mean_token_accuracy": 0.76853346824646, + "num_tokens": 1002930372.0, + "step": 2092 + }, + { + "epoch": 1.2421364985163206, + "grad_norm": 0.5604137182235718, + "learning_rate": 1e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7732560038566589, + "num_tokens": 1003402222.0, + "step": 2093 + }, + { + "epoch": 1.2427299703264094, + "grad_norm": 0.5509542226791382, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 
0.7722511887550354, + "num_tokens": 1003846234.0, + "step": 2094 + }, + { + "epoch": 1.2433234421364985, + "grad_norm": 0.5239876508712769, + "learning_rate": 1e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7564631700515747, + "num_tokens": 1004357820.0, + "step": 2095 + }, + { + "epoch": 1.2439169139465875, + "grad_norm": 0.571250319480896, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.768382728099823, + "num_tokens": 1004783597.0, + "step": 2096 + }, + { + "epoch": 1.2445103857566766, + "grad_norm": 0.5864912867546082, + "learning_rate": 1e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7549557685852051, + "num_tokens": 1005269376.0, + "step": 2097 + }, + { + "epoch": 1.2451038575667657, + "grad_norm": 0.5561602711677551, + "learning_rate": 1e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7731680870056152, + "num_tokens": 1005766839.0, + "step": 2098 + }, + { + "epoch": 1.2456973293768545, + "grad_norm": 0.5653255581855774, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7585240602493286, + "num_tokens": 1006194358.0, + "step": 2099 + }, + { + "epoch": 1.2462908011869436, + "grad_norm": 0.5275882482528687, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7648134231567383, + "num_tokens": 1006693640.0, + "step": 2100 + }, + { + "epoch": 1.2468842729970326, + "grad_norm": 0.565430223941803, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7635308504104614, + "num_tokens": 1007192461.0, + "step": 2101 + }, + { + "epoch": 1.2474777448071217, + "grad_norm": 0.5079957842826843, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7646760940551758, + "num_tokens": 1007711814.0, + "step": 2102 + }, + { + "epoch": 1.2480712166172108, + "grad_norm": 0.5146920680999756, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7636078000068665, + "num_tokens": 1008191425.0, + "step": 2103 + }, + { + "epoch": 1.2486646884272996, + "grad_norm": 
0.5269952416419983, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.750403642654419, + "num_tokens": 1008702212.0, + "step": 2104 + }, + { + "epoch": 1.2492581602373887, + "grad_norm": 0.5153070688247681, + "learning_rate": 1e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7612813711166382, + "num_tokens": 1009205235.0, + "step": 2105 + }, + { + "epoch": 1.2498516320474777, + "grad_norm": 0.5118973255157471, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7595269680023193, + "num_tokens": 1009734169.0, + "step": 2106 + }, + { + "epoch": 1.2504451038575668, + "grad_norm": 0.5883051156997681, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.762684166431427, + "num_tokens": 1010156676.0, + "step": 2107 + }, + { + "epoch": 1.2510385756676559, + "grad_norm": 0.5235896110534668, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7645427584648132, + "num_tokens": 1010657791.0, + "step": 2108 + }, + { + "epoch": 1.2516320474777447, + "grad_norm": 0.5674580335617065, + "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7576176524162292, + "num_tokens": 1011125309.0, + "step": 2109 + }, + { + "epoch": 1.2522255192878338, + "grad_norm": 0.5356592535972595, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7735039591789246, + "num_tokens": 1011608990.0, + "step": 2110 + }, + { + "epoch": 1.2528189910979228, + "grad_norm": 0.5402013659477234, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7549493312835693, + "num_tokens": 1012126090.0, + "step": 2111 + }, + { + "epoch": 1.253412462908012, + "grad_norm": 0.521581768989563, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7628651857376099, + "num_tokens": 1012631621.0, + "step": 2112 + }, + { + "epoch": 1.254005934718101, + "grad_norm": 0.5667610764503479, + "learning_rate": 1e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7775508165359497, + 
"num_tokens": 1013126325.0, + "step": 2113 + }, + { + "epoch": 1.25459940652819, + "grad_norm": 0.5697561502456665, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7710516452789307, + "num_tokens": 1013586605.0, + "step": 2114 + }, + { + "epoch": 1.2551928783382789, + "grad_norm": 0.5417295098304749, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7707746624946594, + "num_tokens": 1014037477.0, + "step": 2115 + }, + { + "epoch": 1.255786350148368, + "grad_norm": 0.5282313227653503, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7601622939109802, + "num_tokens": 1014530536.0, + "step": 2116 + }, + { + "epoch": 1.256379821958457, + "grad_norm": 0.5551671981811523, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7781908512115479, + "num_tokens": 1015017477.0, + "step": 2117 + }, + { + "epoch": 1.256973293768546, + "grad_norm": 0.5350055694580078, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.763826310634613, + "num_tokens": 1015512746.0, + "step": 2118 + }, + { + "epoch": 1.257566765578635, + "grad_norm": 0.5923734903335571, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7671753168106079, + "num_tokens": 1015943350.0, + "step": 2119 + }, + { + "epoch": 1.258160237388724, + "grad_norm": 0.5528225302696228, + "learning_rate": 1e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7763077020645142, + "num_tokens": 1016396386.0, + "step": 2120 + }, + { + "epoch": 1.258753709198813, + "grad_norm": 0.5405589938163757, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7698240876197815, + "num_tokens": 1016873078.0, + "step": 2121 + }, + { + "epoch": 1.259347181008902, + "grad_norm": 0.5713402628898621, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7675433158874512, + "num_tokens": 1017364051.0, + "step": 2122 + }, + { + "epoch": 1.2599406528189911, + "grad_norm": 0.541592538356781, + 
"learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7604851126670837, + "num_tokens": 1017853086.0, + "step": 2123 + }, + { + "epoch": 1.2605341246290802, + "grad_norm": 0.5277333855628967, + "learning_rate": 1e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7675619125366211, + "num_tokens": 1018352716.0, + "step": 2124 + }, + { + "epoch": 1.2611275964391693, + "grad_norm": 0.5600378513336182, + "learning_rate": 1e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7749502658843994, + "num_tokens": 1018816875.0, + "step": 2125 + }, + { + "epoch": 1.2617210682492581, + "grad_norm": 0.5666190981864929, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7738641500473022, + "num_tokens": 1019288940.0, + "step": 2126 + }, + { + "epoch": 1.2623145400593472, + "grad_norm": 0.5560542345046997, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7741755247116089, + "num_tokens": 1019761096.0, + "step": 2127 + }, + { + "epoch": 1.2629080118694362, + "grad_norm": 0.5452166199684143, + "learning_rate": 1e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7763643264770508, + "num_tokens": 1020243667.0, + "step": 2128 + }, + { + "epoch": 1.2635014836795253, + "grad_norm": 0.545129120349884, + "learning_rate": 1e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7554757595062256, + "num_tokens": 1020734968.0, + "step": 2129 + }, + { + "epoch": 1.2640949554896141, + "grad_norm": 0.5468602180480957, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7572034597396851, + "num_tokens": 1021242778.0, + "step": 2130 + }, + { + "epoch": 1.2646884272997032, + "grad_norm": 0.5223185420036316, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7675392627716064, + "num_tokens": 1021739385.0, + "step": 2131 + }, + { + "epoch": 1.2652818991097923, + "grad_norm": 0.5178122520446777, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.764838695526123, + "num_tokens": 1022241594.0, + 
"step": 2132 + }, + { + "epoch": 1.2658753709198813, + "grad_norm": 0.558673083782196, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7609381675720215, + "num_tokens": 1022725895.0, + "step": 2133 + }, + { + "epoch": 1.2664688427299704, + "grad_norm": 0.5594430565834045, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7605140209197998, + "num_tokens": 1023199470.0, + "step": 2134 + }, + { + "epoch": 1.2670623145400595, + "grad_norm": 0.7386735677719116, + "learning_rate": 1e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7729729413986206, + "num_tokens": 1023588367.0, + "step": 2135 + }, + { + "epoch": 1.2676557863501483, + "grad_norm": 0.5797968506813049, + "learning_rate": 1e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7715684175491333, + "num_tokens": 1024035017.0, + "step": 2136 + }, + { + "epoch": 1.2682492581602374, + "grad_norm": 0.5462418794631958, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7734567523002625, + "num_tokens": 1024504545.0, + "step": 2137 + }, + { + "epoch": 1.2688427299703264, + "grad_norm": 0.588708758354187, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.77350914478302, + "num_tokens": 1024985602.0, + "step": 2138 + }, + { + "epoch": 1.2694362017804155, + "grad_norm": 0.573943018913269, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7675918340682983, + "num_tokens": 1025439212.0, + "step": 2139 + }, + { + "epoch": 1.2700296735905043, + "grad_norm": 0.5507581830024719, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.755861759185791, + "num_tokens": 1025935982.0, + "step": 2140 + }, + { + "epoch": 1.2706231454005934, + "grad_norm": 0.5650292634963989, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7402902841567993, + "num_tokens": 1026409238.0, + "step": 2141 + }, + { + "epoch": 1.2712166172106825, + "grad_norm": 0.555990993976593, + "learning_rate": 1e-06, + "loss": 
0.7779, + "mean_token_accuracy": 0.7569810152053833, + "num_tokens": 1026917061.0, + "step": 2142 + }, + { + "epoch": 1.2718100890207715, + "grad_norm": 0.5299937129020691, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7661173343658447, + "num_tokens": 1027427563.0, + "step": 2143 + }, + { + "epoch": 1.2724035608308606, + "grad_norm": 0.5543161630630493, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7605185508728027, + "num_tokens": 1027883950.0, + "step": 2144 + }, + { + "epoch": 1.2729970326409497, + "grad_norm": 0.5548838973045349, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7462213039398193, + "num_tokens": 1028380016.0, + "step": 2145 + }, + { + "epoch": 1.2735905044510385, + "grad_norm": 0.6059577465057373, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7611150741577148, + "num_tokens": 1028856267.0, + "step": 2146 + }, + { + "epoch": 1.2741839762611276, + "grad_norm": 0.6037055850028992, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7561241388320923, + "num_tokens": 1029321506.0, + "step": 2147 + }, + { + "epoch": 1.2747774480712166, + "grad_norm": 0.5388346910476685, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7646145820617676, + "num_tokens": 1029795091.0, + "step": 2148 + }, + { + "epoch": 1.2753709198813057, + "grad_norm": 0.5269613265991211, + "learning_rate": 1e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7704852819442749, + "num_tokens": 1030288346.0, + "step": 2149 + }, + { + "epoch": 1.2759643916913945, + "grad_norm": 0.6028329730033875, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7495710253715515, + "num_tokens": 1030757561.0, + "step": 2150 + }, + { + "epoch": 1.2765578635014836, + "grad_norm": 0.5485379695892334, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7659931182861328, + "num_tokens": 1031274450.0, + "step": 2151 + }, + { + "epoch": 
1.2771513353115727, + "grad_norm": 0.5486400723457336, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7781344652175903, + "num_tokens": 1031739921.0, + "step": 2152 + }, + { + "epoch": 1.2777448071216617, + "grad_norm": 0.5360390543937683, + "learning_rate": 1e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7617959976196289, + "num_tokens": 1032230920.0, + "step": 2153 + }, + { + "epoch": 1.2783382789317508, + "grad_norm": 0.5576122403144836, + "learning_rate": 1e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7562683820724487, + "num_tokens": 1032650685.0, + "step": 2154 + }, + { + "epoch": 1.2789317507418398, + "grad_norm": 0.5747937560081482, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7605315446853638, + "num_tokens": 1033109718.0, + "step": 2155 + }, + { + "epoch": 1.279525222551929, + "grad_norm": 0.5702918171882629, + "learning_rate": 1e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.769159734249115, + "num_tokens": 1033581769.0, + "step": 2156 + }, + { + "epoch": 1.2801186943620178, + "grad_norm": 0.5096608400344849, + "learning_rate": 1e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7724651098251343, + "num_tokens": 1034078401.0, + "step": 2157 + }, + { + "epoch": 1.2807121661721068, + "grad_norm": 0.5367279648780823, + "learning_rate": 1e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7721042037010193, + "num_tokens": 1034551374.0, + "step": 2158 + }, + { + "epoch": 1.2813056379821959, + "grad_norm": 0.5623910427093506, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.758290708065033, + "num_tokens": 1035028239.0, + "step": 2159 + }, + { + "epoch": 1.281899109792285, + "grad_norm": 0.5468207001686096, + "learning_rate": 1e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7720587253570557, + "num_tokens": 1035531530.0, + "step": 2160 + }, + { + "epoch": 1.2824925816023738, + "grad_norm": 0.5477449297904968, + "learning_rate": 1e-06, + "loss": 0.7742, + 
"mean_token_accuracy": 0.7572972774505615, + "num_tokens": 1036010756.0, + "step": 2161 + }, + { + "epoch": 1.2830860534124628, + "grad_norm": 0.528963565826416, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.763260006904602, + "num_tokens": 1036485981.0, + "step": 2162 + }, + { + "epoch": 1.283679525222552, + "grad_norm": 0.5566908717155457, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.760441243648529, + "num_tokens": 1036934080.0, + "step": 2163 + }, + { + "epoch": 1.284272997032641, + "grad_norm": 0.5580323934555054, + "learning_rate": 1e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7564354538917542, + "num_tokens": 1037385353.0, + "step": 2164 + }, + { + "epoch": 1.28486646884273, + "grad_norm": 0.5559750199317932, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7697317600250244, + "num_tokens": 1037880560.0, + "step": 2165 + }, + { + "epoch": 1.285459940652819, + "grad_norm": 0.5560102462768555, + "learning_rate": 1e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7479726076126099, + "num_tokens": 1038357892.0, + "step": 2166 + }, + { + "epoch": 1.286053412462908, + "grad_norm": 0.5224907994270325, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7680937051773071, + "num_tokens": 1038853989.0, + "step": 2167 + }, + { + "epoch": 1.286646884272997, + "grad_norm": 0.5227173566818237, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7667441368103027, + "num_tokens": 1039377259.0, + "step": 2168 + }, + { + "epoch": 1.287240356083086, + "grad_norm": 0.5383964776992798, + "learning_rate": 1e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7823541164398193, + "num_tokens": 1039856744.0, + "step": 2169 + }, + { + "epoch": 1.2878338278931751, + "grad_norm": 0.5341297388076782, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7722039222717285, + "num_tokens": 1040326988.0, + "step": 2170 + }, + { + "epoch": 1.288427299703264, + 
"grad_norm": 0.5464023947715759, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7569794654846191, + "num_tokens": 1040818890.0, + "step": 2171 + }, + { + "epoch": 1.289020771513353, + "grad_norm": 0.5434159636497498, + "learning_rate": 1e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7627899646759033, + "num_tokens": 1041296533.0, + "step": 2172 + }, + { + "epoch": 1.289614243323442, + "grad_norm": 0.568114161491394, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7659815549850464, + "num_tokens": 1041715153.0, + "step": 2173 + }, + { + "epoch": 1.2902077151335312, + "grad_norm": 0.5715404748916626, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7807163000106812, + "num_tokens": 1042210915.0, + "step": 2174 + }, + { + "epoch": 1.2908011869436202, + "grad_norm": 0.5290179252624512, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7651790976524353, + "num_tokens": 1042704100.0, + "step": 2175 + }, + { + "epoch": 1.2913946587537093, + "grad_norm": 0.5360859632492065, + "learning_rate": 1e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7503491640090942, + "num_tokens": 1043191898.0, + "step": 2176 + }, + { + "epoch": 1.2919881305637984, + "grad_norm": 0.5828554034233093, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7619668245315552, + "num_tokens": 1043636033.0, + "step": 2177 + }, + { + "epoch": 1.2925816023738872, + "grad_norm": 0.5033060312271118, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.77232825756073, + "num_tokens": 1044177039.0, + "step": 2178 + }, + { + "epoch": 1.2931750741839763, + "grad_norm": 0.5666254162788391, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7604175806045532, + "num_tokens": 1044653312.0, + "step": 2179 + }, + { + "epoch": 1.2937685459940653, + "grad_norm": 0.5149867534637451, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7478717565536499, + 
"num_tokens": 1045172012.0, + "step": 2180 + }, + { + "epoch": 1.2943620178041544, + "grad_norm": 0.5516709685325623, + "learning_rate": 1e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7757676839828491, + "num_tokens": 1045653860.0, + "step": 2181 + }, + { + "epoch": 1.2949554896142432, + "grad_norm": 0.5743293762207031, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7545536756515503, + "num_tokens": 1046112659.0, + "step": 2182 + }, + { + "epoch": 1.2955489614243323, + "grad_norm": 0.5053800344467163, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7644383907318115, + "num_tokens": 1046623906.0, + "step": 2183 + }, + { + "epoch": 1.2961424332344214, + "grad_norm": 0.5079382061958313, + "learning_rate": 1e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.7835446000099182, + "num_tokens": 1047104447.0, + "step": 2184 + }, + { + "epoch": 1.2967359050445104, + "grad_norm": 0.5246291160583496, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7651474475860596, + "num_tokens": 1047606246.0, + "step": 2185 + }, + { + "epoch": 1.2973293768545995, + "grad_norm": 0.5212057828903198, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7598799467086792, + "num_tokens": 1048099939.0, + "step": 2186 + }, + { + "epoch": 1.2979228486646885, + "grad_norm": 0.5478268265724182, + "learning_rate": 1e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7661172151565552, + "num_tokens": 1048581284.0, + "step": 2187 + }, + { + "epoch": 1.2985163204747774, + "grad_norm": 0.5209053158760071, + "learning_rate": 1e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.78053218126297, + "num_tokens": 1049068460.0, + "step": 2188 + }, + { + "epoch": 1.2991097922848664, + "grad_norm": 0.6017789840698242, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.769344687461853, + "num_tokens": 1049510624.0, + "step": 2189 + }, + { + "epoch": 1.2997032640949555, + "grad_norm": 0.5401943922042847, + 
"learning_rate": 1e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7563010454177856, + "num_tokens": 1050022515.0, + "step": 2190 + }, + { + "epoch": 1.3002967359050446, + "grad_norm": 0.539160430431366, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7629064917564392, + "num_tokens": 1050497627.0, + "step": 2191 + }, + { + "epoch": 1.3008902077151334, + "grad_norm": 0.5493919253349304, + "learning_rate": 1e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7589066624641418, + "num_tokens": 1050989802.0, + "step": 2192 + }, + { + "epoch": 1.3014836795252225, + "grad_norm": 0.5162265300750732, + "learning_rate": 1e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7630460858345032, + "num_tokens": 1051489338.0, + "step": 2193 + }, + { + "epoch": 1.3020771513353115, + "grad_norm": 0.5344685912132263, + "learning_rate": 1e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7803316116333008, + "num_tokens": 1051952563.0, + "step": 2194 + }, + { + "epoch": 1.3026706231454006, + "grad_norm": 0.5995444655418396, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7549805641174316, + "num_tokens": 1052373628.0, + "step": 2195 + }, + { + "epoch": 1.3032640949554897, + "grad_norm": 0.5394517779350281, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7558349967002869, + "num_tokens": 1052868330.0, + "step": 2196 + }, + { + "epoch": 1.3038575667655787, + "grad_norm": 0.5460432767868042, + "learning_rate": 1e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7789736986160278, + "num_tokens": 1053344154.0, + "step": 2197 + }, + { + "epoch": 1.3044510385756676, + "grad_norm": 0.5234236121177673, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7678192257881165, + "num_tokens": 1053875384.0, + "step": 2198 + }, + { + "epoch": 1.3050445103857566, + "grad_norm": 0.5204014182090759, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7590580582618713, + "num_tokens": 1054362084.0, + 
"step": 2199 + }, + { + "epoch": 1.3056379821958457, + "grad_norm": 0.5430278778076172, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7674148082733154, + "num_tokens": 1054839580.0, + "step": 2200 + }, + { + "epoch": 1.3062314540059348, + "grad_norm": 0.5376337170600891, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7672122716903687, + "num_tokens": 1055326982.0, + "step": 2201 + }, + { + "epoch": 1.3068249258160236, + "grad_norm": 0.5623001456260681, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.746695876121521, + "num_tokens": 1055779127.0, + "step": 2202 + }, + { + "epoch": 1.3074183976261127, + "grad_norm": 0.5633394718170166, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7527452707290649, + "num_tokens": 1056210782.0, + "step": 2203 + }, + { + "epoch": 1.3080118694362017, + "grad_norm": 0.5334067344665527, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7698357105255127, + "num_tokens": 1056716681.0, + "step": 2204 + }, + { + "epoch": 1.3086053412462908, + "grad_norm": 0.5393185615539551, + "learning_rate": 1e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7773759365081787, + "num_tokens": 1057185781.0, + "step": 2205 + }, + { + "epoch": 1.3091988130563799, + "grad_norm": 0.5731644034385681, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7489306330680847, + "num_tokens": 1057669707.0, + "step": 2206 + }, + { + "epoch": 1.309792284866469, + "grad_norm": 0.5979399085044861, + "learning_rate": 1e-06, + "loss": 0.6716, + "mean_token_accuracy": 0.7836766242980957, + "num_tokens": 1058090151.0, + "step": 2207 + }, + { + "epoch": 1.310385756676558, + "grad_norm": 0.5552513599395752, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7579371929168701, + "num_tokens": 1058602457.0, + "step": 2208 + }, + { + "epoch": 1.3109792284866468, + "grad_norm": 0.5489526987075806, + "learning_rate": 1e-06, + 
"loss": 0.7276, + "mean_token_accuracy": 0.7675838470458984, + "num_tokens": 1059112709.0, + "step": 2209 + }, + { + "epoch": 1.311572700296736, + "grad_norm": 0.566586434841156, + "learning_rate": 1e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7744418382644653, + "num_tokens": 1059581068.0, + "step": 2210 + }, + { + "epoch": 1.312166172106825, + "grad_norm": 0.5730792284011841, + "learning_rate": 1e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7530911564826965, + "num_tokens": 1060034303.0, + "step": 2211 + }, + { + "epoch": 1.312759643916914, + "grad_norm": 0.5752981305122375, + "learning_rate": 1e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.753363847732544, + "num_tokens": 1060487376.0, + "step": 2212 + }, + { + "epoch": 1.3133531157270029, + "grad_norm": 0.548395037651062, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7668778300285339, + "num_tokens": 1060958230.0, + "step": 2213 + }, + { + "epoch": 1.313946587537092, + "grad_norm": 0.5941423177719116, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7500447630882263, + "num_tokens": 1061417764.0, + "step": 2214 + }, + { + "epoch": 1.314540059347181, + "grad_norm": 0.5962669849395752, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7577201128005981, + "num_tokens": 1061855968.0, + "step": 2215 + }, + { + "epoch": 1.31513353115727, + "grad_norm": 0.5676807165145874, + "learning_rate": 1e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7702702283859253, + "num_tokens": 1062324572.0, + "step": 2216 + }, + { + "epoch": 1.3157270029673591, + "grad_norm": 0.5418798327445984, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7675156593322754, + "num_tokens": 1062812143.0, + "step": 2217 + }, + { + "epoch": 1.3163204747774482, + "grad_norm": 0.5838621258735657, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7629098296165466, + "num_tokens": 1063284874.0, + "step": 2218 + }, + { + "epoch": 
1.316913946587537, + "grad_norm": 0.5599802732467651, + "learning_rate": 1e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7514041066169739, + "num_tokens": 1063727509.0, + "step": 2219 + }, + { + "epoch": 1.317507418397626, + "grad_norm": 0.55466628074646, + "learning_rate": 1e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7706791162490845, + "num_tokens": 1064199609.0, + "step": 2220 + }, + { + "epoch": 1.3181008902077151, + "grad_norm": 0.546953022480011, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.75600266456604, + "num_tokens": 1064658489.0, + "step": 2221 + }, + { + "epoch": 1.3186943620178042, + "grad_norm": 0.5340502858161926, + "learning_rate": 1e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7752215266227722, + "num_tokens": 1065142139.0, + "step": 2222 + }, + { + "epoch": 1.319287833827893, + "grad_norm": 0.541115939617157, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7659470438957214, + "num_tokens": 1065632583.0, + "step": 2223 + }, + { + "epoch": 1.3198813056379821, + "grad_norm": 0.5510992407798767, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7506663799285889, + "num_tokens": 1066093200.0, + "step": 2224 + }, + { + "epoch": 1.3204747774480712, + "grad_norm": 0.5379236936569214, + "learning_rate": 1e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.765794038772583, + "num_tokens": 1066545379.0, + "step": 2225 + }, + { + "epoch": 1.3210682492581602, + "grad_norm": 0.5572365522384644, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7486675977706909, + "num_tokens": 1067010842.0, + "step": 2226 + }, + { + "epoch": 1.3216617210682493, + "grad_norm": 0.5823948383331299, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7608121633529663, + "num_tokens": 1067456142.0, + "step": 2227 + }, + { + "epoch": 1.3222551928783384, + "grad_norm": 0.5540863275527954, + "learning_rate": 1e-06, + "loss": 0.7004, + "mean_token_accuracy": 
0.7746708393096924, + "num_tokens": 1067911666.0, + "step": 2228 + }, + { + "epoch": 1.3228486646884274, + "grad_norm": 0.5421498417854309, + "learning_rate": 1e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7852611541748047, + "num_tokens": 1068397231.0, + "step": 2229 + }, + { + "epoch": 1.3234421364985163, + "grad_norm": 0.5247494578361511, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7574572563171387, + "num_tokens": 1068906706.0, + "step": 2230 + }, + { + "epoch": 1.3240356083086053, + "grad_norm": 0.571764349937439, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7578554153442383, + "num_tokens": 1069361890.0, + "step": 2231 + }, + { + "epoch": 1.3246290801186944, + "grad_norm": 0.5935285687446594, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7556933164596558, + "num_tokens": 1069808615.0, + "step": 2232 + }, + { + "epoch": 1.3252225519287835, + "grad_norm": 0.572441816329956, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7561615705490112, + "num_tokens": 1070282977.0, + "step": 2233 + }, + { + "epoch": 1.3258160237388723, + "grad_norm": 0.5706073045730591, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7598006725311279, + "num_tokens": 1070748059.0, + "step": 2234 + }, + { + "epoch": 1.3264094955489614, + "grad_norm": 0.5513944625854492, + "learning_rate": 1e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7542130351066589, + "num_tokens": 1071267418.0, + "step": 2235 + }, + { + "epoch": 1.3270029673590504, + "grad_norm": 0.5333744287490845, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7591381669044495, + "num_tokens": 1071756829.0, + "step": 2236 + }, + { + "epoch": 1.3275964391691395, + "grad_norm": 0.5328585505485535, + "learning_rate": 1e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.7836701273918152, + "num_tokens": 1072252145.0, + "step": 2237 + }, + { + "epoch": 1.3281899109792286, + "grad_norm": 
0.5376327633857727, + "learning_rate": 1e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.7763335108757019, + "num_tokens": 1072732241.0, + "step": 2238 + }, + { + "epoch": 1.3287833827893176, + "grad_norm": 0.5659723877906799, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7546194791793823, + "num_tokens": 1073230649.0, + "step": 2239 + }, + { + "epoch": 1.3293768545994065, + "grad_norm": 0.5438797473907471, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7544766068458557, + "num_tokens": 1073708797.0, + "step": 2240 + }, + { + "epoch": 1.3299703264094955, + "grad_norm": 0.611761748790741, + "learning_rate": 1e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7530417442321777, + "num_tokens": 1074217727.0, + "step": 2241 + }, + { + "epoch": 1.3305637982195846, + "grad_norm": 0.545676052570343, + "learning_rate": 1e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7738680839538574, + "num_tokens": 1074697265.0, + "step": 2242 + }, + { + "epoch": 1.3311572700296737, + "grad_norm": 0.5874198079109192, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7765493392944336, + "num_tokens": 1075136631.0, + "step": 2243 + }, + { + "epoch": 1.3317507418397625, + "grad_norm": 0.5677433609962463, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7649548053741455, + "num_tokens": 1075623801.0, + "step": 2244 + }, + { + "epoch": 1.3323442136498516, + "grad_norm": 0.595953643321991, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7597582340240479, + "num_tokens": 1076053991.0, + "step": 2245 + }, + { + "epoch": 1.3329376854599406, + "grad_norm": 0.551398754119873, + "learning_rate": 1e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7614244818687439, + "num_tokens": 1076519771.0, + "step": 2246 + }, + { + "epoch": 1.3335311572700297, + "grad_norm": 0.5775691866874695, + "learning_rate": 1e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7757008075714111, + 
"num_tokens": 1076958746.0, + "step": 2247 + }, + { + "epoch": 1.3341246290801188, + "grad_norm": 0.5524429678916931, + "learning_rate": 1e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7605655789375305, + "num_tokens": 1077431189.0, + "step": 2248 + }, + { + "epoch": 1.3347181008902078, + "grad_norm": 0.5634854435920715, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7645212411880493, + "num_tokens": 1077879655.0, + "step": 2249 + }, + { + "epoch": 1.3353115727002967, + "grad_norm": 0.5607907176017761, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.756259560585022, + "num_tokens": 1078370837.0, + "step": 2250 + }, + { + "epoch": 1.3359050445103857, + "grad_norm": 0.6075378060340881, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7627131938934326, + "num_tokens": 1078825105.0, + "step": 2251 + }, + { + "epoch": 1.3364985163204748, + "grad_norm": 0.5523192286491394, + "learning_rate": 1e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7802307605743408, + "num_tokens": 1079285950.0, + "step": 2252 + }, + { + "epoch": 1.3370919881305638, + "grad_norm": 0.5453789830207825, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7678914666175842, + "num_tokens": 1079756061.0, + "step": 2253 + }, + { + "epoch": 1.3376854599406527, + "grad_norm": 0.5424240827560425, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7601184844970703, + "num_tokens": 1080215994.0, + "step": 2254 + }, + { + "epoch": 1.3382789317507418, + "grad_norm": 0.5434086322784424, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7796669602394104, + "num_tokens": 1080681127.0, + "step": 2255 + }, + { + "epoch": 1.3388724035608308, + "grad_norm": 0.5558045506477356, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7608225345611572, + "num_tokens": 1081180698.0, + "step": 2256 + }, + { + "epoch": 1.3394658753709199, + "grad_norm": 0.5226301550865173, + 
"learning_rate": 1e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7692736387252808, + "num_tokens": 1081671110.0, + "step": 2257 + }, + { + "epoch": 1.340059347181009, + "grad_norm": 0.5734464526176453, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7599601745605469, + "num_tokens": 1082102091.0, + "step": 2258 + }, + { + "epoch": 1.340652818991098, + "grad_norm": 0.5556466579437256, + "learning_rate": 1e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7703027725219727, + "num_tokens": 1082586736.0, + "step": 2259 + }, + { + "epoch": 1.341246290801187, + "grad_norm": 0.5184152126312256, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7505155801773071, + "num_tokens": 1083090113.0, + "step": 2260 + }, + { + "epoch": 1.341839762611276, + "grad_norm": 0.5145533680915833, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7767257690429688, + "num_tokens": 1083571140.0, + "step": 2261 + }, + { + "epoch": 1.342433234421365, + "grad_norm": 0.5217025279998779, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.774789571762085, + "num_tokens": 1084077984.0, + "step": 2262 + }, + { + "epoch": 1.343026706231454, + "grad_norm": 0.5520344972610474, + "learning_rate": 1e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7838614583015442, + "num_tokens": 1084533905.0, + "step": 2263 + }, + { + "epoch": 1.343620178041543, + "grad_norm": 0.5015196800231934, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7583225965499878, + "num_tokens": 1085047925.0, + "step": 2264 + }, + { + "epoch": 1.344213649851632, + "grad_norm": 0.5392878651618958, + "learning_rate": 1e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7687020897865295, + "num_tokens": 1085560615.0, + "step": 2265 + }, + { + "epoch": 1.344807121661721, + "grad_norm": 0.5777778625488281, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7516963481903076, + "num_tokens": 1086041745.0, + "step": 
2266 + }, + { + "epoch": 1.34540059347181, + "grad_norm": 0.5647364854812622, + "learning_rate": 1e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7695757746696472, + "num_tokens": 1086534790.0, + "step": 2267 + }, + { + "epoch": 1.3459940652818991, + "grad_norm": 0.5171706080436707, + "learning_rate": 1e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7765026092529297, + "num_tokens": 1087032743.0, + "step": 2268 + }, + { + "epoch": 1.3465875370919882, + "grad_norm": 0.5866702795028687, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7592345476150513, + "num_tokens": 1087471871.0, + "step": 2269 + }, + { + "epoch": 1.3471810089020773, + "grad_norm": 0.557554304599762, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.755020260810852, + "num_tokens": 1087929752.0, + "step": 2270 + }, + { + "epoch": 1.347774480712166, + "grad_norm": 0.5484732985496521, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7665050029754639, + "num_tokens": 1088430735.0, + "step": 2271 + }, + { + "epoch": 1.3483679525222552, + "grad_norm": 0.5575672388076782, + "learning_rate": 1e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7654843926429749, + "num_tokens": 1088871547.0, + "step": 2272 + }, + { + "epoch": 1.3489614243323442, + "grad_norm": 0.5330764651298523, + "learning_rate": 1e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7732694149017334, + "num_tokens": 1089378527.0, + "step": 2273 + }, + { + "epoch": 1.3495548961424333, + "grad_norm": 0.5474919080734253, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7585288882255554, + "num_tokens": 1089832398.0, + "step": 2274 + }, + { + "epoch": 1.3501483679525221, + "grad_norm": 0.553622841835022, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7575185298919678, + "num_tokens": 1090311839.0, + "step": 2275 + }, + { + "epoch": 1.3507418397626112, + "grad_norm": 0.5454913973808289, + "learning_rate": 1e-06, + "loss": 0.7367, + 
"mean_token_accuracy": 0.7687637805938721, + "num_tokens": 1090750074.0, + "step": 2276 + }, + { + "epoch": 1.3513353115727003, + "grad_norm": 0.5441092848777771, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.755222737789154, + "num_tokens": 1091240340.0, + "step": 2277 + }, + { + "epoch": 1.3519287833827893, + "grad_norm": 0.5415164828300476, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7457724213600159, + "num_tokens": 1091745918.0, + "step": 2278 + }, + { + "epoch": 1.3525222551928784, + "grad_norm": 0.5272310972213745, + "learning_rate": 1e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7758784294128418, + "num_tokens": 1092228990.0, + "step": 2279 + }, + { + "epoch": 1.3531157270029674, + "grad_norm": 0.5446674227714539, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7670806646347046, + "num_tokens": 1092721916.0, + "step": 2280 + }, + { + "epoch": 1.3537091988130565, + "grad_norm": 0.5249865651130676, + "learning_rate": 1e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7615636587142944, + "num_tokens": 1093227960.0, + "step": 2281 + }, + { + "epoch": 1.3543026706231454, + "grad_norm": 0.5390671491622925, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7476259469985962, + "num_tokens": 1093719873.0, + "step": 2282 + }, + { + "epoch": 1.3548961424332344, + "grad_norm": 0.5264233946800232, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7711226344108582, + "num_tokens": 1094179417.0, + "step": 2283 + }, + { + "epoch": 1.3554896142433235, + "grad_norm": 0.5274903178215027, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7739017009735107, + "num_tokens": 1094668564.0, + "step": 2284 + }, + { + "epoch": 1.3560830860534125, + "grad_norm": 0.5207279324531555, + "learning_rate": 1e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7634141445159912, + "num_tokens": 1095169197.0, + "step": 2285 + }, + { + "epoch": 
1.3566765578635014, + "grad_norm": 0.5399512052536011, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7706307172775269, + "num_tokens": 1095640555.0, + "step": 2286 + }, + { + "epoch": 1.3572700296735905, + "grad_norm": 0.5696055293083191, + "learning_rate": 1e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7731963396072388, + "num_tokens": 1096062317.0, + "step": 2287 + }, + { + "epoch": 1.3578635014836795, + "grad_norm": 0.5279783010482788, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7666580080986023, + "num_tokens": 1096568949.0, + "step": 2288 + }, + { + "epoch": 1.3584569732937686, + "grad_norm": 0.5340074896812439, + "learning_rate": 1e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.782701849937439, + "num_tokens": 1097033144.0, + "step": 2289 + }, + { + "epoch": 1.3590504451038576, + "grad_norm": 0.5046202540397644, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7753101587295532, + "num_tokens": 1097518619.0, + "step": 2290 + }, + { + "epoch": 1.3596439169139467, + "grad_norm": 0.5630753636360168, + "learning_rate": 1e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.7685317993164062, + "num_tokens": 1097989190.0, + "step": 2291 + }, + { + "epoch": 1.3602373887240355, + "grad_norm": 0.5529530644416809, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7649713158607483, + "num_tokens": 1098488300.0, + "step": 2292 + }, + { + "epoch": 1.3608308605341246, + "grad_norm": 0.5730669498443604, + "learning_rate": 1e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7666778564453125, + "num_tokens": 1098933572.0, + "step": 2293 + }, + { + "epoch": 1.3614243323442137, + "grad_norm": 0.5433244109153748, + "learning_rate": 1e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7734858989715576, + "num_tokens": 1099429048.0, + "step": 2294 + }, + { + "epoch": 1.3620178041543027, + "grad_norm": 0.5407789349555969, + "learning_rate": 1e-06, + "loss": 0.7071, + 
"mean_token_accuracy": 0.7748048901557922, + "num_tokens": 1099906300.0, + "step": 2295 + }, + { + "epoch": 1.3626112759643916, + "grad_norm": 0.55284184217453, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7670329213142395, + "num_tokens": 1100391787.0, + "step": 2296 + }, + { + "epoch": 1.3632047477744806, + "grad_norm": 0.5582472085952759, + "learning_rate": 1e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.765668511390686, + "num_tokens": 1100847527.0, + "step": 2297 + }, + { + "epoch": 1.3637982195845697, + "grad_norm": 0.5075766444206238, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.771483302116394, + "num_tokens": 1101432044.0, + "step": 2298 + }, + { + "epoch": 1.3643916913946588, + "grad_norm": 0.5543628334999084, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7711046934127808, + "num_tokens": 1101944931.0, + "step": 2299 + }, + { + "epoch": 1.3649851632047478, + "grad_norm": 0.5533976554870605, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7635371685028076, + "num_tokens": 1102435241.0, + "step": 2300 + }, + { + "epoch": 1.365578635014837, + "grad_norm": 0.5620860457420349, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7609900236129761, + "num_tokens": 1102902756.0, + "step": 2301 + }, + { + "epoch": 1.3661721068249257, + "grad_norm": 0.5711389780044556, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7574135065078735, + "num_tokens": 1103355656.0, + "step": 2302 + }, + { + "epoch": 1.3667655786350148, + "grad_norm": 0.5534954071044922, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7564020156860352, + "num_tokens": 1103817942.0, + "step": 2303 + }, + { + "epoch": 1.3673590504451039, + "grad_norm": 0.5579046607017517, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7786123156547546, + "num_tokens": 1104316718.0, + "step": 2304 + }, + { + "epoch": 
1.367952522255193, + "grad_norm": 0.5941896438598633, + "learning_rate": 1e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7657291889190674, + "num_tokens": 1104792894.0, + "step": 2305 + }, + { + "epoch": 1.3685459940652818, + "grad_norm": 0.5544993877410889, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.76317298412323, + "num_tokens": 1105291660.0, + "step": 2306 + }, + { + "epoch": 1.3691394658753708, + "grad_norm": 0.5538344383239746, + "learning_rate": 1e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7684952616691589, + "num_tokens": 1105742366.0, + "step": 2307 + }, + { + "epoch": 1.36973293768546, + "grad_norm": 0.527181088924408, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7550569772720337, + "num_tokens": 1106269715.0, + "step": 2308 + }, + { + "epoch": 1.370326409495549, + "grad_norm": 0.526197075843811, + "learning_rate": 1e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.7595322132110596, + "num_tokens": 1106781724.0, + "step": 2309 + }, + { + "epoch": 1.370919881305638, + "grad_norm": 0.5641116499900818, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7682652473449707, + "num_tokens": 1107254605.0, + "step": 2310 + }, + { + "epoch": 1.371513353115727, + "grad_norm": 0.5368207693099976, + "learning_rate": 1e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7614284157752991, + "num_tokens": 1107754287.0, + "step": 2311 + }, + { + "epoch": 1.3721068249258161, + "grad_norm": 0.5013571381568909, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7647089958190918, + "num_tokens": 1108299462.0, + "step": 2312 + }, + { + "epoch": 1.372700296735905, + "grad_norm": 0.544499397277832, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7485023736953735, + "num_tokens": 1108780900.0, + "step": 2313 + }, + { + "epoch": 1.373293768545994, + "grad_norm": 0.5403860807418823, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 
0.7714452147483826, + "num_tokens": 1109271250.0, + "step": 2314 + }, + { + "epoch": 1.3738872403560831, + "grad_norm": 0.5390545129776001, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7677338123321533, + "num_tokens": 1109754591.0, + "step": 2315 + }, + { + "epoch": 1.3744807121661722, + "grad_norm": 0.5414617657661438, + "learning_rate": 1e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7673539519309998, + "num_tokens": 1110232314.0, + "step": 2316 + }, + { + "epoch": 1.375074183976261, + "grad_norm": 0.5510480403900146, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7662419080734253, + "num_tokens": 1110702549.0, + "step": 2317 + }, + { + "epoch": 1.37566765578635, + "grad_norm": 0.5582880973815918, + "learning_rate": 1e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7763093113899231, + "num_tokens": 1111147985.0, + "step": 2318 + }, + { + "epoch": 1.3762611275964391, + "grad_norm": 0.5422030687332153, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7650824785232544, + "num_tokens": 1111628681.0, + "step": 2319 + }, + { + "epoch": 1.3768545994065282, + "grad_norm": 0.5359447598457336, + "learning_rate": 1e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7793112397193909, + "num_tokens": 1112114989.0, + "step": 2320 + }, + { + "epoch": 1.3774480712166173, + "grad_norm": 0.5688968896865845, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7674528956413269, + "num_tokens": 1112535770.0, + "step": 2321 + }, + { + "epoch": 1.3780415430267063, + "grad_norm": 0.5576720833778381, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7681459188461304, + "num_tokens": 1112990308.0, + "step": 2322 + }, + { + "epoch": 1.3786350148367952, + "grad_norm": 0.5237002968788147, + "learning_rate": 1e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.7698178291320801, + "num_tokens": 1113504240.0, + "step": 2323 + }, + { + "epoch": 1.3792284866468842, + "grad_norm": 
0.5343592762947083, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7568579316139221, + "num_tokens": 1113996486.0, + "step": 2324 + }, + { + "epoch": 1.3798219584569733, + "grad_norm": 0.5384488105773926, + "learning_rate": 1e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7737101912498474, + "num_tokens": 1114478737.0, + "step": 2325 + }, + { + "epoch": 1.3804154302670624, + "grad_norm": 0.5379281044006348, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.766589879989624, + "num_tokens": 1114957021.0, + "step": 2326 + }, + { + "epoch": 1.3810089020771512, + "grad_norm": 0.5257212519645691, + "learning_rate": 1e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7603199481964111, + "num_tokens": 1115445047.0, + "step": 2327 + }, + { + "epoch": 1.3816023738872403, + "grad_norm": 0.5356423854827881, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7684131860733032, + "num_tokens": 1115921889.0, + "step": 2328 + }, + { + "epoch": 1.3821958456973293, + "grad_norm": 0.5269123911857605, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7587425708770752, + "num_tokens": 1116455733.0, + "step": 2329 + }, + { + "epoch": 1.3827893175074184, + "grad_norm": 0.5527017712593079, + "learning_rate": 1e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7643134593963623, + "num_tokens": 1116937620.0, + "step": 2330 + }, + { + "epoch": 1.3833827893175075, + "grad_norm": 0.600703775882721, + "learning_rate": 1e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7499996423721313, + "num_tokens": 1117394727.0, + "step": 2331 + }, + { + "epoch": 1.3839762611275965, + "grad_norm": 0.5151260495185852, + "learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7630317807197571, + "num_tokens": 1117921270.0, + "step": 2332 + }, + { + "epoch": 1.3845697329376856, + "grad_norm": 0.49598002433776855, + "learning_rate": 1e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.767066478729248, + 
"num_tokens": 1118462911.0, + "step": 2333 + }, + { + "epoch": 1.3851632047477744, + "grad_norm": 0.5169249773025513, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7596725821495056, + "num_tokens": 1119002050.0, + "step": 2334 + }, + { + "epoch": 1.3857566765578635, + "grad_norm": 0.5703950524330139, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7567805051803589, + "num_tokens": 1119457088.0, + "step": 2335 + }, + { + "epoch": 1.3863501483679526, + "grad_norm": 0.5347667932510376, + "learning_rate": 1e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7549117803573608, + "num_tokens": 1120028725.0, + "step": 2336 + }, + { + "epoch": 1.3869436201780416, + "grad_norm": 0.5528847575187683, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.763746976852417, + "num_tokens": 1120477646.0, + "step": 2337 + }, + { + "epoch": 1.3875370919881305, + "grad_norm": 0.5623385906219482, + "learning_rate": 1e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7761179804801941, + "num_tokens": 1120906088.0, + "step": 2338 + }, + { + "epoch": 1.3881305637982195, + "grad_norm": 0.566204845905304, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.776658296585083, + "num_tokens": 1121387630.0, + "step": 2339 + }, + { + "epoch": 1.3887240356083086, + "grad_norm": 0.5964410305023193, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7621327638626099, + "num_tokens": 1121855475.0, + "step": 2340 + }, + { + "epoch": 1.3893175074183977, + "grad_norm": 0.565897524356842, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7561368942260742, + "num_tokens": 1122301138.0, + "step": 2341 + }, + { + "epoch": 1.3899109792284867, + "grad_norm": 0.5677763819694519, + "learning_rate": 1e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.770050048828125, + "num_tokens": 1122768303.0, + "step": 2342 + }, + { + "epoch": 1.3905044510385758, + "grad_norm": 0.6022994518280029, + 
"learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7627661228179932, + "num_tokens": 1123205411.0, + "step": 2343 + }, + { + "epoch": 1.3910979228486646, + "grad_norm": 0.5572198033332825, + "learning_rate": 1e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7522197961807251, + "num_tokens": 1123699272.0, + "step": 2344 + }, + { + "epoch": 1.3916913946587537, + "grad_norm": 0.522746741771698, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7696568369865417, + "num_tokens": 1124194199.0, + "step": 2345 + }, + { + "epoch": 1.3922848664688428, + "grad_norm": 0.5465329885482788, + "learning_rate": 1e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7719135880470276, + "num_tokens": 1124653662.0, + "step": 2346 + }, + { + "epoch": 1.3928783382789318, + "grad_norm": 0.5825669169425964, + "learning_rate": 1e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7655305862426758, + "num_tokens": 1125118166.0, + "step": 2347 + }, + { + "epoch": 1.3934718100890207, + "grad_norm": 0.5561555027961731, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.765897274017334, + "num_tokens": 1125602522.0, + "step": 2348 + }, + { + "epoch": 1.3940652818991097, + "grad_norm": 0.5090498328208923, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7686542272567749, + "num_tokens": 1126125371.0, + "step": 2349 + }, + { + "epoch": 1.3946587537091988, + "grad_norm": 0.538944661617279, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7590726613998413, + "num_tokens": 1126593309.0, + "step": 2350 + }, + { + "epoch": 1.3952522255192878, + "grad_norm": 0.5981987118721008, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7533155083656311, + "num_tokens": 1127051461.0, + "step": 2351 + }, + { + "epoch": 1.395845697329377, + "grad_norm": 0.5478616952896118, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7602720260620117, + "num_tokens": 1127543855.0, + 
"step": 2352 + }, + { + "epoch": 1.396439169139466, + "grad_norm": 0.5623833537101746, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7620620727539062, + "num_tokens": 1128017437.0, + "step": 2353 + }, + { + "epoch": 1.3970326409495548, + "grad_norm": 0.5880696177482605, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7656488418579102, + "num_tokens": 1128475080.0, + "step": 2354 + }, + { + "epoch": 1.3976261127596439, + "grad_norm": 0.5612945556640625, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7668527960777283, + "num_tokens": 1128971411.0, + "step": 2355 + }, + { + "epoch": 1.398219584569733, + "grad_norm": 0.5567473769187927, + "learning_rate": 1e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7566144466400146, + "num_tokens": 1129453626.0, + "step": 2356 + }, + { + "epoch": 1.398813056379822, + "grad_norm": 0.5461642742156982, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7612769603729248, + "num_tokens": 1129947565.0, + "step": 2357 + }, + { + "epoch": 1.3994065281899108, + "grad_norm": 0.5638736486434937, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7683777213096619, + "num_tokens": 1130400970.0, + "step": 2358 + }, + { + "epoch": 1.4, + "grad_norm": 0.5489938259124756, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7515852451324463, + "num_tokens": 1130877564.0, + "step": 2359 + }, + { + "epoch": 1.400593471810089, + "grad_norm": 0.562872052192688, + "learning_rate": 1e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7514587640762329, + "num_tokens": 1131322269.0, + "step": 2360 + }, + { + "epoch": 1.401186943620178, + "grad_norm": 0.5249844789505005, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7562100887298584, + "num_tokens": 1131838319.0, + "step": 2361 + }, + { + "epoch": 1.401780415430267, + "grad_norm": 0.5582031011581421, + "learning_rate": 1e-06, + "loss": 0.7474, + 
"mean_token_accuracy": 0.7646578550338745, + "num_tokens": 1132332369.0, + "step": 2362 + }, + { + "epoch": 1.4023738872403562, + "grad_norm": 0.5436859130859375, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7647221684455872, + "num_tokens": 1132817848.0, + "step": 2363 + }, + { + "epoch": 1.4029673590504452, + "grad_norm": 0.5297410488128662, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7602138519287109, + "num_tokens": 1133331446.0, + "step": 2364 + }, + { + "epoch": 1.403560830860534, + "grad_norm": 0.5546438097953796, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7690240740776062, + "num_tokens": 1133787432.0, + "step": 2365 + }, + { + "epoch": 1.4041543026706231, + "grad_norm": 0.5756086707115173, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7565200328826904, + "num_tokens": 1134245719.0, + "step": 2366 + }, + { + "epoch": 1.4047477744807122, + "grad_norm": 0.5714012384414673, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7723888158798218, + "num_tokens": 1134696890.0, + "step": 2367 + }, + { + "epoch": 1.4053412462908013, + "grad_norm": 0.5421547293663025, + "learning_rate": 1e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7591654062271118, + "num_tokens": 1135197303.0, + "step": 2368 + }, + { + "epoch": 1.40593471810089, + "grad_norm": 0.5333195328712463, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7543271780014038, + "num_tokens": 1135676796.0, + "step": 2369 + }, + { + "epoch": 1.4065281899109792, + "grad_norm": 0.5917748212814331, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7630923390388489, + "num_tokens": 1136128529.0, + "step": 2370 + }, + { + "epoch": 1.4071216617210682, + "grad_norm": 0.546647846698761, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7724721431732178, + "num_tokens": 1136618227.0, + "step": 2371 + }, + { + "epoch": 
1.4077151335311573, + "grad_norm": 0.5768990516662598, + "learning_rate": 1e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.773658037185669, + "num_tokens": 1137073039.0, + "step": 2372 + }, + { + "epoch": 1.4083086053412464, + "grad_norm": 0.5613460540771484, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7585067749023438, + "num_tokens": 1137534340.0, + "step": 2373 + }, + { + "epoch": 1.4089020771513354, + "grad_norm": 0.5364819765090942, + "learning_rate": 1e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7621669769287109, + "num_tokens": 1138014239.0, + "step": 2374 + }, + { + "epoch": 1.4094955489614243, + "grad_norm": 0.5493054986000061, + "learning_rate": 1e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.778272271156311, + "num_tokens": 1138466956.0, + "step": 2375 + }, + { + "epoch": 1.4100890207715133, + "grad_norm": 0.5841337442398071, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.759868860244751, + "num_tokens": 1138917539.0, + "step": 2376 + }, + { + "epoch": 1.4106824925816024, + "grad_norm": 0.5877319574356079, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.75151127576828, + "num_tokens": 1139369024.0, + "step": 2377 + }, + { + "epoch": 1.4112759643916915, + "grad_norm": 0.5381548404693604, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7597723007202148, + "num_tokens": 1139840669.0, + "step": 2378 + }, + { + "epoch": 1.4118694362017803, + "grad_norm": 0.5719998478889465, + "learning_rate": 1e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.771625280380249, + "num_tokens": 1140294655.0, + "step": 2379 + }, + { + "epoch": 1.4124629080118694, + "grad_norm": 0.5218430757522583, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7568544149398804, + "num_tokens": 1140814585.0, + "step": 2380 + }, + { + "epoch": 1.4130563798219584, + "grad_norm": 0.5716444253921509, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 
0.7666007280349731, + "num_tokens": 1141280148.0, + "step": 2381 + }, + { + "epoch": 1.4136498516320475, + "grad_norm": 0.5549851059913635, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7683252096176147, + "num_tokens": 1141736857.0, + "step": 2382 + }, + { + "epoch": 1.4142433234421365, + "grad_norm": 0.5535829663276672, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7642791867256165, + "num_tokens": 1142208672.0, + "step": 2383 + }, + { + "epoch": 1.4148367952522256, + "grad_norm": 0.5624601244926453, + "learning_rate": 1e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.784841001033783, + "num_tokens": 1142687685.0, + "step": 2384 + }, + { + "epoch": 1.4154302670623147, + "grad_norm": 0.5476112961769104, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7560732960700989, + "num_tokens": 1143193424.0, + "step": 2385 + }, + { + "epoch": 1.4160237388724035, + "grad_norm": 0.5740953087806702, + "learning_rate": 1e-06, + "loss": 0.701, + "mean_token_accuracy": 0.7763715386390686, + "num_tokens": 1143647190.0, + "step": 2386 + }, + { + "epoch": 1.4166172106824926, + "grad_norm": 0.5764477849006653, + "learning_rate": 1e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7831794023513794, + "num_tokens": 1144152707.0, + "step": 2387 + }, + { + "epoch": 1.4172106824925816, + "grad_norm": 0.5500415563583374, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7743121385574341, + "num_tokens": 1144617128.0, + "step": 2388 + }, + { + "epoch": 1.4178041543026707, + "grad_norm": 0.5429306030273438, + "learning_rate": 1e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7643124461174011, + "num_tokens": 1145079926.0, + "step": 2389 + }, + { + "epoch": 1.4183976261127595, + "grad_norm": 0.5724273920059204, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7571519017219543, + "num_tokens": 1145559360.0, + "step": 2390 + }, + { + "epoch": 1.4189910979228486, + "grad_norm": 
0.5701696276664734, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7666561603546143, + "num_tokens": 1146055286.0, + "step": 2391 + }, + { + "epoch": 1.4195845697329377, + "grad_norm": 0.5900562405586243, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7514730095863342, + "num_tokens": 1146509736.0, + "step": 2392 + }, + { + "epoch": 1.4201780415430267, + "grad_norm": 0.5610188245773315, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7700369358062744, + "num_tokens": 1146971403.0, + "step": 2393 + }, + { + "epoch": 1.4207715133531158, + "grad_norm": 0.5689743161201477, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7571600675582886, + "num_tokens": 1147453536.0, + "step": 2394 + }, + { + "epoch": 1.4213649851632049, + "grad_norm": 0.5019379258155823, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7723355293273926, + "num_tokens": 1148004739.0, + "step": 2395 + }, + { + "epoch": 1.4219584569732937, + "grad_norm": 0.6029088497161865, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7449919581413269, + "num_tokens": 1148479258.0, + "step": 2396 + }, + { + "epoch": 1.4225519287833828, + "grad_norm": 0.5560181736946106, + "learning_rate": 1e-06, + "loss": 0.784, + "mean_token_accuracy": 0.7555251121520996, + "num_tokens": 1148938034.0, + "step": 2397 + }, + { + "epoch": 1.4231454005934718, + "grad_norm": 0.535998523235321, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7704421877861023, + "num_tokens": 1149448721.0, + "step": 2398 + }, + { + "epoch": 1.423738872403561, + "grad_norm": 0.5462932586669922, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7606383562088013, + "num_tokens": 1149931672.0, + "step": 2399 + }, + { + "epoch": 1.4243323442136497, + "grad_norm": 0.5414100885391235, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7611749768257141, + 
"num_tokens": 1150407760.0, + "step": 2400 + }, + { + "epoch": 1.4249258160237388, + "grad_norm": 0.5523884892463684, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7636024951934814, + "num_tokens": 1150870897.0, + "step": 2401 + }, + { + "epoch": 1.4255192878338279, + "grad_norm": 0.5408405065536499, + "learning_rate": 1e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7766752243041992, + "num_tokens": 1151392357.0, + "step": 2402 + }, + { + "epoch": 1.426112759643917, + "grad_norm": 0.5402135848999023, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7597343921661377, + "num_tokens": 1151928173.0, + "step": 2403 + }, + { + "epoch": 1.426706231454006, + "grad_norm": 0.5402604341506958, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7728263139724731, + "num_tokens": 1152416105.0, + "step": 2404 + }, + { + "epoch": 1.427299703264095, + "grad_norm": 0.566586434841156, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7687227725982666, + "num_tokens": 1152859398.0, + "step": 2405 + }, + { + "epoch": 1.427893175074184, + "grad_norm": 0.5143384337425232, + "learning_rate": 1e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7734208106994629, + "num_tokens": 1153385526.0, + "step": 2406 + }, + { + "epoch": 1.428486646884273, + "grad_norm": 0.5443839430809021, + "learning_rate": 1e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7723552584648132, + "num_tokens": 1153875771.0, + "step": 2407 + }, + { + "epoch": 1.429080118694362, + "grad_norm": 0.5466213822364807, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7653430104255676, + "num_tokens": 1154342045.0, + "step": 2408 + }, + { + "epoch": 1.429673590504451, + "grad_norm": 0.5869951844215393, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.749701738357544, + "num_tokens": 1154791804.0, + "step": 2409 + }, + { + "epoch": 1.43026706231454, + "grad_norm": 0.5477595925331116, + 
"learning_rate": 1e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7634716033935547, + "num_tokens": 1155292568.0, + "step": 2410 + }, + { + "epoch": 1.430860534124629, + "grad_norm": 0.5700995326042175, + "learning_rate": 1e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7548019289970398, + "num_tokens": 1155803336.0, + "step": 2411 + }, + { + "epoch": 1.431454005934718, + "grad_norm": 0.5578590035438538, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7739380598068237, + "num_tokens": 1156262202.0, + "step": 2412 + }, + { + "epoch": 1.4320474777448071, + "grad_norm": 0.5481002330780029, + "learning_rate": 1e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7685679197311401, + "num_tokens": 1156721745.0, + "step": 2413 + }, + { + "epoch": 1.4326409495548962, + "grad_norm": 0.5867969989776611, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7534792423248291, + "num_tokens": 1157171117.0, + "step": 2414 + }, + { + "epoch": 1.4332344213649852, + "grad_norm": 0.5493407845497131, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7588769197463989, + "num_tokens": 1157623808.0, + "step": 2415 + }, + { + "epoch": 1.4338278931750743, + "grad_norm": 0.5857632756233215, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7595536112785339, + "num_tokens": 1158048521.0, + "step": 2416 + }, + { + "epoch": 1.4344213649851631, + "grad_norm": 0.5451502799987793, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7647914290428162, + "num_tokens": 1158531490.0, + "step": 2417 + }, + { + "epoch": 1.4350148367952522, + "grad_norm": 0.5632843375205994, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7614902853965759, + "num_tokens": 1158988836.0, + "step": 2418 + }, + { + "epoch": 1.4356083086053413, + "grad_norm": 0.5215664505958557, + "learning_rate": 1e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7829017639160156, + "num_tokens": 1159464924.0, + 
"step": 2419 + }, + { + "epoch": 1.4362017804154303, + "grad_norm": 0.5327317118644714, + "learning_rate": 1e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7487961053848267, + "num_tokens": 1159957330.0, + "step": 2420 + }, + { + "epoch": 1.4367952522255192, + "grad_norm": 0.5286823511123657, + "learning_rate": 1e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7637635469436646, + "num_tokens": 1160447720.0, + "step": 2421 + }, + { + "epoch": 1.4373887240356082, + "grad_norm": 0.5654878616333008, + "learning_rate": 1e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7542714476585388, + "num_tokens": 1160891163.0, + "step": 2422 + }, + { + "epoch": 1.4379821958456973, + "grad_norm": 0.527784526348114, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7668958306312561, + "num_tokens": 1161362456.0, + "step": 2423 + }, + { + "epoch": 1.4385756676557864, + "grad_norm": 0.558974027633667, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7728564739227295, + "num_tokens": 1161795710.0, + "step": 2424 + }, + { + "epoch": 1.4391691394658754, + "grad_norm": 0.5416187047958374, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7713439464569092, + "num_tokens": 1162242435.0, + "step": 2425 + }, + { + "epoch": 1.4397626112759645, + "grad_norm": 0.6067332625389099, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.761673092842102, + "num_tokens": 1162723332.0, + "step": 2426 + }, + { + "epoch": 1.4403560830860533, + "grad_norm": 0.548831582069397, + "learning_rate": 1e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7515146732330322, + "num_tokens": 1163186473.0, + "step": 2427 + }, + { + "epoch": 1.4409495548961424, + "grad_norm": 0.5693408846855164, + "learning_rate": 1e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7661565542221069, + "num_tokens": 1163608908.0, + "step": 2428 + }, + { + "epoch": 1.4415430267062315, + "grad_norm": 0.5816534757614136, + "learning_rate": 1e-06, + "loss": 
0.7221, + "mean_token_accuracy": 0.7711392641067505, + "num_tokens": 1164072720.0, + "step": 2429 + }, + { + "epoch": 1.4421364985163205, + "grad_norm": 0.5610699653625488, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7589519619941711, + "num_tokens": 1164550014.0, + "step": 2430 + }, + { + "epoch": 1.4427299703264094, + "grad_norm": 0.5529942512512207, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7571796178817749, + "num_tokens": 1165032213.0, + "step": 2431 + }, + { + "epoch": 1.4433234421364984, + "grad_norm": 0.5429450273513794, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7645217180252075, + "num_tokens": 1165551233.0, + "step": 2432 + }, + { + "epoch": 1.4439169139465875, + "grad_norm": 0.5477073192596436, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.765326976776123, + "num_tokens": 1166023369.0, + "step": 2433 + }, + { + "epoch": 1.4445103857566766, + "grad_norm": 0.5351706743240356, + "learning_rate": 1e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7520833015441895, + "num_tokens": 1166538326.0, + "step": 2434 + }, + { + "epoch": 1.4451038575667656, + "grad_norm": 0.5412142276763916, + "learning_rate": 1e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7599989175796509, + "num_tokens": 1167005622.0, + "step": 2435 + }, + { + "epoch": 1.4456973293768547, + "grad_norm": 0.573226809501648, + "learning_rate": 1e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7690973281860352, + "num_tokens": 1167483490.0, + "step": 2436 + }, + { + "epoch": 1.4462908011869438, + "grad_norm": 0.5335953235626221, + "learning_rate": 1e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7705270051956177, + "num_tokens": 1167982948.0, + "step": 2437 + }, + { + "epoch": 1.4468842729970326, + "grad_norm": 0.5328845381736755, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7723779678344727, + "num_tokens": 1168461661.0, + "step": 2438 + }, + { + "epoch": 
1.4474777448071217, + "grad_norm": 0.5303167104721069, + "learning_rate": 1e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.769760012626648, + "num_tokens": 1168925121.0, + "step": 2439 + }, + { + "epoch": 1.4480712166172107, + "grad_norm": 0.5640119314193726, + "learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7677106857299805, + "num_tokens": 1169432068.0, + "step": 2440 + }, + { + "epoch": 1.4486646884272998, + "grad_norm": 0.5569595694541931, + "learning_rate": 1e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7745779752731323, + "num_tokens": 1169942621.0, + "step": 2441 + }, + { + "epoch": 1.4492581602373886, + "grad_norm": 0.5384665131568909, + "learning_rate": 1e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7773127555847168, + "num_tokens": 1170411619.0, + "step": 2442 + }, + { + "epoch": 1.4498516320474777, + "grad_norm": 0.5156204104423523, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7616449594497681, + "num_tokens": 1170933370.0, + "step": 2443 + }, + { + "epoch": 1.4504451038575668, + "grad_norm": 0.5413007736206055, + "learning_rate": 1e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7721219062805176, + "num_tokens": 1171404477.0, + "step": 2444 + }, + { + "epoch": 1.4510385756676558, + "grad_norm": 0.5711852312088013, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7669609785079956, + "num_tokens": 1171870809.0, + "step": 2445 + }, + { + "epoch": 1.4516320474777449, + "grad_norm": 0.52338707447052, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7713571190834045, + "num_tokens": 1172341668.0, + "step": 2446 + }, + { + "epoch": 1.452225519287834, + "grad_norm": 0.5378754734992981, + "learning_rate": 1e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7619167566299438, + "num_tokens": 1172820394.0, + "step": 2447 + }, + { + "epoch": 1.4528189910979228, + "grad_norm": 0.5410083532333374, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 
0.7542202472686768, + "num_tokens": 1173291733.0, + "step": 2448 + }, + { + "epoch": 1.4534124629080118, + "grad_norm": 0.5245661735534668, + "learning_rate": 1e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7527890205383301, + "num_tokens": 1173763795.0, + "step": 2449 + }, + { + "epoch": 1.454005934718101, + "grad_norm": 0.5499823093414307, + "learning_rate": 1e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7776148319244385, + "num_tokens": 1174249978.0, + "step": 2450 + }, + { + "epoch": 1.45459940652819, + "grad_norm": 0.5317578315734863, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7592644691467285, + "num_tokens": 1174745579.0, + "step": 2451 + }, + { + "epoch": 1.4551928783382788, + "grad_norm": 0.5143936276435852, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7722513675689697, + "num_tokens": 1175247864.0, + "step": 2452 + }, + { + "epoch": 1.4557863501483679, + "grad_norm": 0.5331985950469971, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7713655233383179, + "num_tokens": 1175718522.0, + "step": 2453 + }, + { + "epoch": 1.456379821958457, + "grad_norm": 0.523323655128479, + "learning_rate": 1e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7813296318054199, + "num_tokens": 1176200443.0, + "step": 2454 + }, + { + "epoch": 1.456973293768546, + "grad_norm": 0.610562264919281, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.75342857837677, + "num_tokens": 1176639148.0, + "step": 2455 + }, + { + "epoch": 1.457566765578635, + "grad_norm": 0.5573117733001709, + "learning_rate": 1e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7731418609619141, + "num_tokens": 1177088651.0, + "step": 2456 + }, + { + "epoch": 1.4581602373887241, + "grad_norm": 0.5680326819419861, + "learning_rate": 1e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7703876495361328, + "num_tokens": 1177552775.0, + "step": 2457 + }, + { + "epoch": 1.458753709198813, + "grad_norm": 
0.5433498024940491, + "learning_rate": 1e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7643319964408875, + "num_tokens": 1178032029.0, + "step": 2458 + }, + { + "epoch": 1.459347181008902, + "grad_norm": 0.5523362755775452, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7585595846176147, + "num_tokens": 1178521142.0, + "step": 2459 + }, + { + "epoch": 1.459940652818991, + "grad_norm": 0.525613009929657, + "learning_rate": 1e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7825213074684143, + "num_tokens": 1179000570.0, + "step": 2460 + }, + { + "epoch": 1.4605341246290802, + "grad_norm": 0.5655561089515686, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7613626718521118, + "num_tokens": 1179467958.0, + "step": 2461 + }, + { + "epoch": 1.461127596439169, + "grad_norm": 0.5264009833335876, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7666664123535156, + "num_tokens": 1179962718.0, + "step": 2462 + }, + { + "epoch": 1.461721068249258, + "grad_norm": 0.526784360408783, + "learning_rate": 1e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7660195827484131, + "num_tokens": 1180458604.0, + "step": 2463 + }, + { + "epoch": 1.4623145400593471, + "grad_norm": 0.5791081786155701, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7644197940826416, + "num_tokens": 1180883558.0, + "step": 2464 + }, + { + "epoch": 1.4629080118694362, + "grad_norm": 0.5628628134727478, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7466832399368286, + "num_tokens": 1181352027.0, + "step": 2465 + }, + { + "epoch": 1.4635014836795253, + "grad_norm": 0.5338968634605408, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7663939595222473, + "num_tokens": 1181822978.0, + "step": 2466 + }, + { + "epoch": 1.4640949554896143, + "grad_norm": 0.5627351403236389, + "learning_rate": 1e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7565717697143555, + "num_tokens": 
1182273319.0, + "step": 2467 + }, + { + "epoch": 1.4646884272997034, + "grad_norm": 0.5449380278587341, + "learning_rate": 1e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7697523236274719, + "num_tokens": 1182738239.0, + "step": 2468 + }, + { + "epoch": 1.4652818991097922, + "grad_norm": 0.5279460549354553, + "learning_rate": 1e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7762809991836548, + "num_tokens": 1183236964.0, + "step": 2469 + }, + { + "epoch": 1.4658753709198813, + "grad_norm": 0.5257254838943481, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7598361372947693, + "num_tokens": 1183751227.0, + "step": 2470 + }, + { + "epoch": 1.4664688427299704, + "grad_norm": 0.5402024984359741, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7552227973937988, + "num_tokens": 1184233107.0, + "step": 2471 + }, + { + "epoch": 1.4670623145400594, + "grad_norm": 0.5686319470405579, + "learning_rate": 1e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7683528661727905, + "num_tokens": 1184691769.0, + "step": 2472 + }, + { + "epoch": 1.4676557863501483, + "grad_norm": 0.563861072063446, + "learning_rate": 1e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7602972984313965, + "num_tokens": 1185161020.0, + "step": 2473 + }, + { + "epoch": 1.4682492581602373, + "grad_norm": 0.5362774133682251, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7728675603866577, + "num_tokens": 1185662932.0, + "step": 2474 + }, + { + "epoch": 1.4688427299703264, + "grad_norm": 0.5446257591247559, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7658478021621704, + "num_tokens": 1186138420.0, + "step": 2475 + }, + { + "epoch": 1.4694362017804155, + "grad_norm": 0.5308637022972107, + "learning_rate": 1e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7553166747093201, + "num_tokens": 1186630660.0, + "step": 2476 + }, + { + "epoch": 1.4700296735905045, + "grad_norm": 0.5731183290481567, + 
"learning_rate": 1e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7779577970504761, + "num_tokens": 1187069642.0, + "step": 2477 + }, + { + "epoch": 1.4706231454005936, + "grad_norm": 0.5700432658195496, + "learning_rate": 1e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.7780648469924927, + "num_tokens": 1187541527.0, + "step": 2478 + }, + { + "epoch": 1.4712166172106824, + "grad_norm": 0.5518674254417419, + "learning_rate": 1e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.770092785358429, + "num_tokens": 1188066411.0, + "step": 2479 + }, + { + "epoch": 1.4718100890207715, + "grad_norm": 0.550613522529602, + "learning_rate": 1e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.7701877355575562, + "num_tokens": 1188576946.0, + "step": 2480 + }, + { + "epoch": 1.4724035608308605, + "grad_norm": 0.5756472945213318, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.76140958070755, + "num_tokens": 1189002800.0, + "step": 2481 + }, + { + "epoch": 1.4729970326409496, + "grad_norm": 0.5574402809143066, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7579771280288696, + "num_tokens": 1189502294.0, + "step": 2482 + }, + { + "epoch": 1.4735905044510385, + "grad_norm": 0.5849486589431763, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7569551467895508, + "num_tokens": 1189966066.0, + "step": 2483 + }, + { + "epoch": 1.4741839762611275, + "grad_norm": 0.5311191082000732, + "learning_rate": 1e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7798656225204468, + "num_tokens": 1190449378.0, + "step": 2484 + }, + { + "epoch": 1.4747774480712166, + "grad_norm": 0.5573433637619019, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7666304111480713, + "num_tokens": 1190931504.0, + "step": 2485 + }, + { + "epoch": 1.4753709198813056, + "grad_norm": 0.5418586730957031, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7630970478057861, + "num_tokens": 1191399498.0, + 
"step": 2486 + }, + { + "epoch": 1.4759643916913947, + "grad_norm": 0.5538633465766907, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7683066129684448, + "num_tokens": 1191855945.0, + "step": 2487 + }, + { + "epoch": 1.4765578635014838, + "grad_norm": 0.5330761075019836, + "learning_rate": 1e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.7813341021537781, + "num_tokens": 1192347402.0, + "step": 2488 + }, + { + "epoch": 1.4771513353115728, + "grad_norm": 0.5694212317466736, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7523484230041504, + "num_tokens": 1192798141.0, + "step": 2489 + }, + { + "epoch": 1.4777448071216617, + "grad_norm": 0.5946594476699829, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7575985193252563, + "num_tokens": 1193218180.0, + "step": 2490 + }, + { + "epoch": 1.4783382789317507, + "grad_norm": 0.5486148595809937, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7635674476623535, + "num_tokens": 1193703228.0, + "step": 2491 + }, + { + "epoch": 1.4789317507418398, + "grad_norm": 0.5701727867126465, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7583858966827393, + "num_tokens": 1194138731.0, + "step": 2492 + }, + { + "epoch": 1.4795252225519289, + "grad_norm": 0.551194965839386, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7697739601135254, + "num_tokens": 1194601489.0, + "step": 2493 + }, + { + "epoch": 1.4801186943620177, + "grad_norm": 0.566530704498291, + "learning_rate": 1e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7595304846763611, + "num_tokens": 1195072784.0, + "step": 2494 + }, + { + "epoch": 1.4807121661721068, + "grad_norm": 0.5696130394935608, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7652132511138916, + "num_tokens": 1195500791.0, + "step": 2495 + }, + { + "epoch": 1.4813056379821958, + "grad_norm": 0.5440458059310913, + "learning_rate": 1e-06, + 
"loss": 0.6806, + "mean_token_accuracy": 0.7821950912475586, + "num_tokens": 1195947240.0, + "step": 2496 + }, + { + "epoch": 1.481899109792285, + "grad_norm": 0.5772197842597961, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7604237794876099, + "num_tokens": 1196424345.0, + "step": 2497 + }, + { + "epoch": 1.482492581602374, + "grad_norm": 0.5531192421913147, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7623881697654724, + "num_tokens": 1196903708.0, + "step": 2498 + }, + { + "epoch": 1.483086053412463, + "grad_norm": 0.5342877507209778, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7657689452171326, + "num_tokens": 1197379470.0, + "step": 2499 + }, + { + "epoch": 1.4836795252225519, + "grad_norm": 0.5883756875991821, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7515919208526611, + "num_tokens": 1197804283.0, + "step": 2500 + }, + { + "epoch": 1.484272997032641, + "grad_norm": 0.5666540861129761, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7698197364807129, + "num_tokens": 1198315188.0, + "step": 2501 + }, + { + "epoch": 1.48486646884273, + "grad_norm": 0.5230048298835754, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.769633412361145, + "num_tokens": 1198817988.0, + "step": 2502 + }, + { + "epoch": 1.485459940652819, + "grad_norm": 0.5251368880271912, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7663785219192505, + "num_tokens": 1199297013.0, + "step": 2503 + }, + { + "epoch": 1.486053412462908, + "grad_norm": 0.6336071491241455, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.765349268913269, + "num_tokens": 1199758450.0, + "step": 2504 + }, + { + "epoch": 1.486646884272997, + "grad_norm": 0.5653886795043945, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7758865356445312, + "num_tokens": 1200233079.0, + "step": 2505 + }, + { + "epoch": 
1.487240356083086, + "grad_norm": 0.5566973686218262, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7611565589904785, + "num_tokens": 1200705813.0, + "step": 2506 + }, + { + "epoch": 1.487833827893175, + "grad_norm": 0.5577322244644165, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7734246253967285, + "num_tokens": 1201181516.0, + "step": 2507 + }, + { + "epoch": 1.4884272997032642, + "grad_norm": 0.601364016532898, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7721495628356934, + "num_tokens": 1201641501.0, + "step": 2508 + }, + { + "epoch": 1.4890207715133532, + "grad_norm": 0.561059832572937, + "learning_rate": 1e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7630107402801514, + "num_tokens": 1202106244.0, + "step": 2509 + }, + { + "epoch": 1.489614243323442, + "grad_norm": 0.5727632641792297, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7710975408554077, + "num_tokens": 1202544729.0, + "step": 2510 + }, + { + "epoch": 1.4902077151335311, + "grad_norm": 0.5660256743431091, + "learning_rate": 1e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.775640606880188, + "num_tokens": 1203045571.0, + "step": 2511 + }, + { + "epoch": 1.4908011869436202, + "grad_norm": 0.5700075030326843, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7659658193588257, + "num_tokens": 1203530472.0, + "step": 2512 + }, + { + "epoch": 1.4913946587537092, + "grad_norm": 0.5371332764625549, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7634052634239197, + "num_tokens": 1204036805.0, + "step": 2513 + }, + { + "epoch": 1.491988130563798, + "grad_norm": 0.5362785458564758, + "learning_rate": 1e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7711356282234192, + "num_tokens": 1204548712.0, + "step": 2514 + }, + { + "epoch": 1.4925816023738872, + "grad_norm": 0.5575463175773621, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 
0.7489409446716309, + "num_tokens": 1205037018.0, + "step": 2515 + }, + { + "epoch": 1.4931750741839762, + "grad_norm": 0.5587406754493713, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7609466314315796, + "num_tokens": 1205496151.0, + "step": 2516 + }, + { + "epoch": 1.4937685459940653, + "grad_norm": 0.5582229495048523, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7574433088302612, + "num_tokens": 1205943027.0, + "step": 2517 + }, + { + "epoch": 1.4943620178041543, + "grad_norm": 0.56627357006073, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7761483192443848, + "num_tokens": 1206390614.0, + "step": 2518 + }, + { + "epoch": 1.4949554896142434, + "grad_norm": 0.5758493542671204, + "learning_rate": 1e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7898757457733154, + "num_tokens": 1206899885.0, + "step": 2519 + }, + { + "epoch": 1.4955489614243325, + "grad_norm": 0.5478986501693726, + "learning_rate": 1e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7572246789932251, + "num_tokens": 1207357106.0, + "step": 2520 + }, + { + "epoch": 1.4961424332344213, + "grad_norm": 0.5606705546379089, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7643165588378906, + "num_tokens": 1207805410.0, + "step": 2521 + }, + { + "epoch": 1.4967359050445104, + "grad_norm": 0.5470688343048096, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7770836353302002, + "num_tokens": 1208272668.0, + "step": 2522 + }, + { + "epoch": 1.4973293768545994, + "grad_norm": 0.5236456990242004, + "learning_rate": 1e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7679167985916138, + "num_tokens": 1208756734.0, + "step": 2523 + }, + { + "epoch": 1.4979228486646885, + "grad_norm": 0.5175266861915588, + "learning_rate": 1e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7611521482467651, + "num_tokens": 1209243354.0, + "step": 2524 + }, + { + "epoch": 1.4985163204747773, + "grad_norm": 
0.5403431057929993, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7741332054138184, + "num_tokens": 1209731097.0, + "step": 2525 + }, + { + "epoch": 1.4991097922848664, + "grad_norm": 0.5297387838363647, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7715771198272705, + "num_tokens": 1210241200.0, + "step": 2526 + }, + { + "epoch": 1.4997032640949555, + "grad_norm": 0.5480251312255859, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7580571174621582, + "num_tokens": 1210720150.0, + "step": 2527 + }, + { + "epoch": 1.5002967359050445, + "grad_norm": 0.5031856298446655, + "learning_rate": 1e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7526285648345947, + "num_tokens": 1211223041.0, + "step": 2528 + }, + { + "epoch": 1.5008902077151336, + "grad_norm": 0.53411465883255, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7631957530975342, + "num_tokens": 1211697487.0, + "step": 2529 + }, + { + "epoch": 1.5014836795252227, + "grad_norm": 0.5200777053833008, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7647758722305298, + "num_tokens": 1212196780.0, + "step": 2530 + }, + { + "epoch": 1.5020771513353117, + "grad_norm": 0.5251827239990234, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7676801681518555, + "num_tokens": 1212715871.0, + "step": 2531 + }, + { + "epoch": 1.5026706231454006, + "grad_norm": 0.546850860118866, + "learning_rate": 1e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.760602593421936, + "num_tokens": 1213162647.0, + "step": 2532 + }, + { + "epoch": 1.5032640949554896, + "grad_norm": 0.568973958492279, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7498696446418762, + "num_tokens": 1213633405.0, + "step": 2533 + }, + { + "epoch": 1.5038575667655787, + "grad_norm": 0.5583697557449341, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7532444596290588, + 
"num_tokens": 1214101648.0, + "step": 2534 + }, + { + "epoch": 1.5044510385756675, + "grad_norm": 0.5459467768669128, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7639083862304688, + "num_tokens": 1214558113.0, + "step": 2535 + }, + { + "epoch": 1.5050445103857566, + "grad_norm": 0.5558127164840698, + "learning_rate": 1e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7608716487884521, + "num_tokens": 1215047062.0, + "step": 2536 + }, + { + "epoch": 1.5056379821958457, + "grad_norm": 0.5859026908874512, + "learning_rate": 1e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7622963190078735, + "num_tokens": 1215508539.0, + "step": 2537 + }, + { + "epoch": 1.5062314540059347, + "grad_norm": 0.5577350854873657, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7650796175003052, + "num_tokens": 1215985563.0, + "step": 2538 + }, + { + "epoch": 1.5068249258160238, + "grad_norm": 0.5617337822914124, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7766826748847961, + "num_tokens": 1216456065.0, + "step": 2539 + }, + { + "epoch": 1.5074183976261128, + "grad_norm": 0.5364125370979309, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7721276879310608, + "num_tokens": 1216928181.0, + "step": 2540 + }, + { + "epoch": 1.508011869436202, + "grad_norm": 0.517488956451416, + "learning_rate": 1e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7747794985771179, + "num_tokens": 1217468435.0, + "step": 2541 + }, + { + "epoch": 1.5086053412462908, + "grad_norm": 0.5495137572288513, + "learning_rate": 1e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7542283535003662, + "num_tokens": 1217955115.0, + "step": 2542 + }, + { + "epoch": 1.5091988130563798, + "grad_norm": 0.5572502613067627, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.762259840965271, + "num_tokens": 1218419858.0, + "step": 2543 + }, + { + "epoch": 1.5097922848664689, + "grad_norm": 0.5475000739097595, + 
"learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.76119065284729, + "num_tokens": 1218917073.0, + "step": 2544 + }, + { + "epoch": 1.5103857566765577, + "grad_norm": 0.5546038746833801, + "learning_rate": 1e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7562327980995178, + "num_tokens": 1219401366.0, + "step": 2545 + }, + { + "epoch": 1.5109792284866468, + "grad_norm": 0.5204025506973267, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.768298327922821, + "num_tokens": 1219912894.0, + "step": 2546 + }, + { + "epoch": 1.5115727002967358, + "grad_norm": 0.5476945638656616, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7712002396583557, + "num_tokens": 1220383994.0, + "step": 2547 + }, + { + "epoch": 1.512166172106825, + "grad_norm": 0.5300473570823669, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7660727500915527, + "num_tokens": 1220895442.0, + "step": 2548 + }, + { + "epoch": 1.512759643916914, + "grad_norm": 0.5442838072776794, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7681301832199097, + "num_tokens": 1221353014.0, + "step": 2549 + }, + { + "epoch": 1.513353115727003, + "grad_norm": 0.5497540831565857, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7713061571121216, + "num_tokens": 1221833648.0, + "step": 2550 + }, + { + "epoch": 1.513946587537092, + "grad_norm": 0.5142173171043396, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7574021816253662, + "num_tokens": 1222317289.0, + "step": 2551 + }, + { + "epoch": 1.5145400593471812, + "grad_norm": 0.5726134181022644, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7712669372558594, + "num_tokens": 1222791858.0, + "step": 2552 + }, + { + "epoch": 1.51513353115727, + "grad_norm": 0.521561324596405, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7692556977272034, + "num_tokens": 1223295822.0, + "step": 
2553 + }, + { + "epoch": 1.515727002967359, + "grad_norm": 0.5088169574737549, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7723812460899353, + "num_tokens": 1223789435.0, + "step": 2554 + }, + { + "epoch": 1.516320474777448, + "grad_norm": 0.5276128053665161, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7666445970535278, + "num_tokens": 1224315558.0, + "step": 2555 + }, + { + "epoch": 1.516913946587537, + "grad_norm": 0.5328348875045776, + "learning_rate": 1e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.7835264205932617, + "num_tokens": 1224818129.0, + "step": 2556 + }, + { + "epoch": 1.517507418397626, + "grad_norm": 0.5164393782615662, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7596807479858398, + "num_tokens": 1225339253.0, + "step": 2557 + }, + { + "epoch": 1.518100890207715, + "grad_norm": 0.5245217084884644, + "learning_rate": 1e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7601891160011292, + "num_tokens": 1225822505.0, + "step": 2558 + }, + { + "epoch": 1.5186943620178042, + "grad_norm": 0.5457455515861511, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7732075452804565, + "num_tokens": 1226351233.0, + "step": 2559 + }, + { + "epoch": 1.5192878338278932, + "grad_norm": 0.6229371428489685, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7672247886657715, + "num_tokens": 1226818090.0, + "step": 2560 + }, + { + "epoch": 1.5198813056379823, + "grad_norm": 0.5264418125152588, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7720630168914795, + "num_tokens": 1227349541.0, + "step": 2561 + }, + { + "epoch": 1.5204747774480714, + "grad_norm": 0.5742418169975281, + "learning_rate": 1e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7535043954849243, + "num_tokens": 1227782214.0, + "step": 2562 + }, + { + "epoch": 1.5210682492581602, + "grad_norm": 0.5826101899147034, + "learning_rate": 1e-06, + "loss": 0.7678, + 
"mean_token_accuracy": 0.7566293478012085, + "num_tokens": 1228248029.0, + "step": 2563 + }, + { + "epoch": 1.5216617210682493, + "grad_norm": 0.5824927091598511, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7756719589233398, + "num_tokens": 1228709157.0, + "step": 2564 + }, + { + "epoch": 1.5222551928783383, + "grad_norm": 0.5387170910835266, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7690290808677673, + "num_tokens": 1229180529.0, + "step": 2565 + }, + { + "epoch": 1.5228486646884272, + "grad_norm": 0.5473017692565918, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.768409013748169, + "num_tokens": 1229627698.0, + "step": 2566 + }, + { + "epoch": 1.5234421364985162, + "grad_norm": 0.5733948349952698, + "learning_rate": 1e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7609529495239258, + "num_tokens": 1230074353.0, + "step": 2567 + }, + { + "epoch": 1.5240356083086053, + "grad_norm": 0.5686808824539185, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7689172029495239, + "num_tokens": 1230545591.0, + "step": 2568 + }, + { + "epoch": 1.5246290801186944, + "grad_norm": 0.5497048497200012, + "learning_rate": 1e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7647186517715454, + "num_tokens": 1231034008.0, + "step": 2569 + }, + { + "epoch": 1.5252225519287834, + "grad_norm": 0.5268952250480652, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7719261646270752, + "num_tokens": 1231535334.0, + "step": 2570 + }, + { + "epoch": 1.5258160237388725, + "grad_norm": 0.5453515648841858, + "learning_rate": 1e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.758637547492981, + "num_tokens": 1232024925.0, + "step": 2571 + }, + { + "epoch": 1.5264094955489615, + "grad_norm": 0.5519124865531921, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7624756097793579, + "num_tokens": 1232520044.0, + "step": 2572 + }, + { + "epoch": 
1.5270029673590506, + "grad_norm": 0.5494973063468933, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7587342262268066, + "num_tokens": 1232981905.0, + "step": 2573 + }, + { + "epoch": 1.5275964391691395, + "grad_norm": 0.5209267735481262, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7680883407592773, + "num_tokens": 1233498088.0, + "step": 2574 + }, + { + "epoch": 1.5281899109792285, + "grad_norm": 0.5460824966430664, + "learning_rate": 1e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7667712569236755, + "num_tokens": 1233980293.0, + "step": 2575 + }, + { + "epoch": 1.5287833827893174, + "grad_norm": 0.5722175240516663, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7707998752593994, + "num_tokens": 1234458723.0, + "step": 2576 + }, + { + "epoch": 1.5293768545994064, + "grad_norm": 0.5428505539894104, + "learning_rate": 1e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.760263204574585, + "num_tokens": 1234932641.0, + "step": 2577 + }, + { + "epoch": 1.5299703264094955, + "grad_norm": 0.5829002261161804, + "learning_rate": 1e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.7700774669647217, + "num_tokens": 1235384915.0, + "step": 2578 + }, + { + "epoch": 1.5305637982195845, + "grad_norm": 0.5467536449432373, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.761904776096344, + "num_tokens": 1235846226.0, + "step": 2579 + }, + { + "epoch": 1.5311572700296736, + "grad_norm": 0.49227654933929443, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7626990079879761, + "num_tokens": 1236397524.0, + "step": 2580 + }, + { + "epoch": 1.5317507418397627, + "grad_norm": 0.5579875707626343, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7567328214645386, + "num_tokens": 1236868423.0, + "step": 2581 + }, + { + "epoch": 1.5323442136498517, + "grad_norm": 0.5052948594093323, + "learning_rate": 1e-06, + "loss": 0.7456, + 
"mean_token_accuracy": 0.7662342190742493, + "num_tokens": 1237421417.0, + "step": 2582 + }, + { + "epoch": 1.5329376854599408, + "grad_norm": 0.5282191634178162, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7677465677261353, + "num_tokens": 1237932274.0, + "step": 2583 + }, + { + "epoch": 1.5335311572700296, + "grad_norm": 0.5399059653282166, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7644933462142944, + "num_tokens": 1238415418.0, + "step": 2584 + }, + { + "epoch": 1.5341246290801187, + "grad_norm": 0.5328333973884583, + "learning_rate": 1e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7701075077056885, + "num_tokens": 1238918782.0, + "step": 2585 + }, + { + "epoch": 1.5347181008902078, + "grad_norm": 0.49045640230178833, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7682604789733887, + "num_tokens": 1239443554.0, + "step": 2586 + }, + { + "epoch": 1.5353115727002966, + "grad_norm": 0.5256922245025635, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7704019546508789, + "num_tokens": 1239951880.0, + "step": 2587 + }, + { + "epoch": 1.5359050445103857, + "grad_norm": 0.5143671035766602, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7683459520339966, + "num_tokens": 1240460005.0, + "step": 2588 + }, + { + "epoch": 1.5364985163204747, + "grad_norm": 0.5504940152168274, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7635622620582581, + "num_tokens": 1240898609.0, + "step": 2589 + }, + { + "epoch": 1.5370919881305638, + "grad_norm": 0.5613613128662109, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7728236317634583, + "num_tokens": 1241376114.0, + "step": 2590 + }, + { + "epoch": 1.5376854599406529, + "grad_norm": 0.5723199844360352, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7486757636070251, + "num_tokens": 1241831591.0, + "step": 2591 + }, + { + "epoch": 
1.538278931750742, + "grad_norm": 0.5315283536911011, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7597417831420898, + "num_tokens": 1242344086.0, + "step": 2592 + }, + { + "epoch": 1.538872403560831, + "grad_norm": 0.533554196357727, + "learning_rate": 1e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7768828272819519, + "num_tokens": 1242822795.0, + "step": 2593 + }, + { + "epoch": 1.5394658753709198, + "grad_norm": 0.5667798519134521, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7590458989143372, + "num_tokens": 1243284458.0, + "step": 2594 + }, + { + "epoch": 1.540059347181009, + "grad_norm": 0.5390031933784485, + "learning_rate": 1e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.7836577296257019, + "num_tokens": 1243752674.0, + "step": 2595 + }, + { + "epoch": 1.540652818991098, + "grad_norm": 0.5139046907424927, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7611021995544434, + "num_tokens": 1244265683.0, + "step": 2596 + }, + { + "epoch": 1.5412462908011868, + "grad_norm": 0.5529632568359375, + "learning_rate": 1e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7781237959861755, + "num_tokens": 1244747663.0, + "step": 2597 + }, + { + "epoch": 1.5418397626112759, + "grad_norm": 0.5280836224555969, + "learning_rate": 1e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7745348811149597, + "num_tokens": 1245252909.0, + "step": 2598 + }, + { + "epoch": 1.542433234421365, + "grad_norm": 0.5156794190406799, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7754648327827454, + "num_tokens": 1245713956.0, + "step": 2599 + }, + { + "epoch": 1.543026706231454, + "grad_norm": 0.5273917317390442, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7634184956550598, + "num_tokens": 1246200683.0, + "step": 2600 + }, + { + "epoch": 1.543620178041543, + "grad_norm": 0.5294809937477112, + "learning_rate": 1e-06, + "loss": 0.7794, + "mean_token_accuracy": 
0.7569997310638428, + "num_tokens": 1246695506.0, + "step": 2601 + }, + { + "epoch": 1.5442136498516321, + "grad_norm": 0.532188892364502, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7584664821624756, + "num_tokens": 1247166135.0, + "step": 2602 + }, + { + "epoch": 1.5448071216617212, + "grad_norm": 0.5464181900024414, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7597296237945557, + "num_tokens": 1247629695.0, + "step": 2603 + }, + { + "epoch": 1.5454005934718102, + "grad_norm": 0.5358730554580688, + "learning_rate": 1e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7778972387313843, + "num_tokens": 1248112198.0, + "step": 2604 + }, + { + "epoch": 1.545994065281899, + "grad_norm": 0.526755690574646, + "learning_rate": 1e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7657215595245361, + "num_tokens": 1248647050.0, + "step": 2605 + }, + { + "epoch": 1.5465875370919882, + "grad_norm": 0.5373163223266602, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7469339370727539, + "num_tokens": 1249123296.0, + "step": 2606 + }, + { + "epoch": 1.547181008902077, + "grad_norm": 0.5326826572418213, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7642334699630737, + "num_tokens": 1249621183.0, + "step": 2607 + }, + { + "epoch": 1.547774480712166, + "grad_norm": 0.5074824094772339, + "learning_rate": 1e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7685761451721191, + "num_tokens": 1250125962.0, + "step": 2608 + }, + { + "epoch": 1.5483679525222551, + "grad_norm": 0.5846856832504272, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.7550417184829712, + "num_tokens": 1250585472.0, + "step": 2609 + }, + { + "epoch": 1.5489614243323442, + "grad_norm": 0.5726735591888428, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7593029737472534, + "num_tokens": 1251034820.0, + "step": 2610 + }, + { + "epoch": 1.5495548961424332, + "grad_norm": 
0.5085038542747498, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7684415578842163, + "num_tokens": 1251551547.0, + "step": 2611 + }, + { + "epoch": 1.5501483679525223, + "grad_norm": 0.5541948080062866, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.768602728843689, + "num_tokens": 1252047572.0, + "step": 2612 + }, + { + "epoch": 1.5507418397626114, + "grad_norm": 0.5595284104347229, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7609292268753052, + "num_tokens": 1252507300.0, + "step": 2613 + }, + { + "epoch": 1.5513353115727004, + "grad_norm": 0.5319796800613403, + "learning_rate": 1e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.784482479095459, + "num_tokens": 1252991891.0, + "step": 2614 + }, + { + "epoch": 1.5519287833827893, + "grad_norm": 0.5454875230789185, + "learning_rate": 1e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7751778960227966, + "num_tokens": 1253459099.0, + "step": 2615 + }, + { + "epoch": 1.5525222551928783, + "grad_norm": 0.5575346946716309, + "learning_rate": 1e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.755136251449585, + "num_tokens": 1253917022.0, + "step": 2616 + }, + { + "epoch": 1.5531157270029674, + "grad_norm": 0.5563409328460693, + "learning_rate": 1e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7644764184951782, + "num_tokens": 1254359044.0, + "step": 2617 + }, + { + "epoch": 1.5537091988130562, + "grad_norm": 0.5223515629768372, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7643018960952759, + "num_tokens": 1254855409.0, + "step": 2618 + }, + { + "epoch": 1.5543026706231453, + "grad_norm": 0.5747600197792053, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7713368535041809, + "num_tokens": 1255300188.0, + "step": 2619 + }, + { + "epoch": 1.5548961424332344, + "grad_norm": 0.5370877385139465, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7676985263824463, + 
"num_tokens": 1255839967.0, + "step": 2620 + }, + { + "epoch": 1.5554896142433234, + "grad_norm": 0.5755411982536316, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7649387121200562, + "num_tokens": 1256297681.0, + "step": 2621 + }, + { + "epoch": 1.5560830860534125, + "grad_norm": 0.5384711623191833, + "learning_rate": 1e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7637767791748047, + "num_tokens": 1256729033.0, + "step": 2622 + }, + { + "epoch": 1.5566765578635016, + "grad_norm": 0.5326809287071228, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7698075771331787, + "num_tokens": 1257209845.0, + "step": 2623 + }, + { + "epoch": 1.5572700296735906, + "grad_norm": 0.5784183740615845, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7609658241271973, + "num_tokens": 1257656041.0, + "step": 2624 + }, + { + "epoch": 1.5578635014836797, + "grad_norm": 0.5035441517829895, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7620041370391846, + "num_tokens": 1258184395.0, + "step": 2625 + }, + { + "epoch": 1.5584569732937685, + "grad_norm": 0.5395311713218689, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7517831921577454, + "num_tokens": 1258654947.0, + "step": 2626 + }, + { + "epoch": 1.5590504451038576, + "grad_norm": 0.5332985520362854, + "learning_rate": 1e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7614238262176514, + "num_tokens": 1259130375.0, + "step": 2627 + }, + { + "epoch": 1.5596439169139464, + "grad_norm": 0.5538811683654785, + "learning_rate": 1e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7627928256988525, + "num_tokens": 1259604991.0, + "step": 2628 + }, + { + "epoch": 1.5602373887240355, + "grad_norm": 0.5275079011917114, + "learning_rate": 1e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.7518222332000732, + "num_tokens": 1260107186.0, + "step": 2629 + }, + { + "epoch": 1.5608308605341246, + "grad_norm": 0.5398702621459961, 
+ "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7755929231643677, + "num_tokens": 1260557928.0, + "step": 2630 + }, + { + "epoch": 1.5614243323442136, + "grad_norm": 0.5316545367240906, + "learning_rate": 1e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7748937606811523, + "num_tokens": 1261028271.0, + "step": 2631 + }, + { + "epoch": 1.5620178041543027, + "grad_norm": 0.5203627943992615, + "learning_rate": 1e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7880675792694092, + "num_tokens": 1261526826.0, + "step": 2632 + }, + { + "epoch": 1.5626112759643918, + "grad_norm": 0.5473642349243164, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7745567560195923, + "num_tokens": 1261977991.0, + "step": 2633 + }, + { + "epoch": 1.5632047477744808, + "grad_norm": 0.5274827480316162, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7711364030838013, + "num_tokens": 1262499391.0, + "step": 2634 + }, + { + "epoch": 1.5637982195845699, + "grad_norm": 0.5417507290840149, + "learning_rate": 1e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7610933780670166, + "num_tokens": 1262983324.0, + "step": 2635 + }, + { + "epoch": 1.5643916913946587, + "grad_norm": 0.5702774524688721, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7516012191772461, + "num_tokens": 1263436605.0, + "step": 2636 + }, + { + "epoch": 1.5649851632047478, + "grad_norm": 0.5457707047462463, + "learning_rate": 1e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7849531173706055, + "num_tokens": 1263919640.0, + "step": 2637 + }, + { + "epoch": 1.5655786350148369, + "grad_norm": 0.5514423251152039, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7459482550621033, + "num_tokens": 1264431051.0, + "step": 2638 + }, + { + "epoch": 1.5661721068249257, + "grad_norm": 0.5601052641868591, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7806107997894287, + "num_tokens": 1264836200.0, 
+ "step": 2639 + }, + { + "epoch": 1.5667655786350148, + "grad_norm": 0.5580078363418579, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7613938450813293, + "num_tokens": 1265299587.0, + "step": 2640 + }, + { + "epoch": 1.5673590504451038, + "grad_norm": 0.5290590524673462, + "learning_rate": 1e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.7600390315055847, + "num_tokens": 1265777123.0, + "step": 2641 + }, + { + "epoch": 1.5679525222551929, + "grad_norm": 0.5233980417251587, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7588219046592712, + "num_tokens": 1266264015.0, + "step": 2642 + }, + { + "epoch": 1.568545994065282, + "grad_norm": 0.5363311171531677, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7537407875061035, + "num_tokens": 1266750161.0, + "step": 2643 + }, + { + "epoch": 1.569139465875371, + "grad_norm": 0.5261262059211731, + "learning_rate": 1e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7574079036712646, + "num_tokens": 1267258877.0, + "step": 2644 + }, + { + "epoch": 1.56973293768546, + "grad_norm": 0.5468514561653137, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7656320333480835, + "num_tokens": 1267727413.0, + "step": 2645 + }, + { + "epoch": 1.570326409495549, + "grad_norm": 0.5613793730735779, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.77272629737854, + "num_tokens": 1268160235.0, + "step": 2646 + }, + { + "epoch": 1.570919881305638, + "grad_norm": 0.5074893832206726, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7740252614021301, + "num_tokens": 1268661308.0, + "step": 2647 + }, + { + "epoch": 1.571513353115727, + "grad_norm": 0.5697522759437561, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7640852928161621, + "num_tokens": 1269113476.0, + "step": 2648 + }, + { + "epoch": 1.5721068249258159, + "grad_norm": 0.5702451467514038, + "learning_rate": 1e-06, + "loss": 
0.7791, + "mean_token_accuracy": 0.754371702671051, + "num_tokens": 1269563507.0, + "step": 2649 + }, + { + "epoch": 1.572700296735905, + "grad_norm": 0.5609722137451172, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7683744430541992, + "num_tokens": 1270020874.0, + "step": 2650 + }, + { + "epoch": 1.573293768545994, + "grad_norm": 0.5637466907501221, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7639449834823608, + "num_tokens": 1270519562.0, + "step": 2651 + }, + { + "epoch": 1.573887240356083, + "grad_norm": 0.5712355375289917, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.756206750869751, + "num_tokens": 1270991661.0, + "step": 2652 + }, + { + "epoch": 1.5744807121661721, + "grad_norm": 0.5442721247673035, + "learning_rate": 1e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7713095545768738, + "num_tokens": 1271429523.0, + "step": 2653 + }, + { + "epoch": 1.5750741839762612, + "grad_norm": 0.5662513971328735, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7606910467147827, + "num_tokens": 1271907091.0, + "step": 2654 + }, + { + "epoch": 1.5756676557863503, + "grad_norm": 0.5781581997871399, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7669427990913391, + "num_tokens": 1272325531.0, + "step": 2655 + }, + { + "epoch": 1.5762611275964393, + "grad_norm": 0.542361855506897, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7601444721221924, + "num_tokens": 1272824971.0, + "step": 2656 + }, + { + "epoch": 1.5768545994065282, + "grad_norm": 0.5498576164245605, + "learning_rate": 1e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7772724032402039, + "num_tokens": 1273287772.0, + "step": 2657 + }, + { + "epoch": 1.5774480712166172, + "grad_norm": 0.5592021942138672, + "learning_rate": 1e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7665202617645264, + "num_tokens": 1273750820.0, + "step": 2658 + }, + { + "epoch": 
1.578041543026706, + "grad_norm": 0.5998704433441162, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7595645189285278, + "num_tokens": 1274161588.0, + "step": 2659 + }, + { + "epoch": 1.5786350148367951, + "grad_norm": 0.5252704620361328, + "learning_rate": 1e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7580789923667908, + "num_tokens": 1274634399.0, + "step": 2660 + }, + { + "epoch": 1.5792284866468842, + "grad_norm": 0.5612102150917053, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7566276788711548, + "num_tokens": 1275104535.0, + "step": 2661 + }, + { + "epoch": 1.5798219584569733, + "grad_norm": 0.5775704979896545, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7512960433959961, + "num_tokens": 1275581013.0, + "step": 2662 + }, + { + "epoch": 1.5804154302670623, + "grad_norm": 0.56618332862854, + "learning_rate": 1e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7617568969726562, + "num_tokens": 1276037490.0, + "step": 2663 + }, + { + "epoch": 1.5810089020771514, + "grad_norm": 0.5665500164031982, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7588685750961304, + "num_tokens": 1276481236.0, + "step": 2664 + }, + { + "epoch": 1.5816023738872405, + "grad_norm": 0.5266575813293457, + "learning_rate": 1e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7734366059303284, + "num_tokens": 1276964835.0, + "step": 2665 + }, + { + "epoch": 1.5821958456973295, + "grad_norm": 0.5458011627197266, + "learning_rate": 1e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7564713358879089, + "num_tokens": 1277422237.0, + "step": 2666 + }, + { + "epoch": 1.5827893175074184, + "grad_norm": 0.5548267364501953, + "learning_rate": 1e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7628968954086304, + "num_tokens": 1277916519.0, + "step": 2667 + }, + { + "epoch": 1.5833827893175074, + "grad_norm": 0.5152975916862488, + "learning_rate": 1e-06, + "loss": 0.7299, + 
"mean_token_accuracy": 0.7673060297966003, + "num_tokens": 1278428299.0, + "step": 2668 + }, + { + "epoch": 1.5839762611275965, + "grad_norm": 0.525046706199646, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7673935890197754, + "num_tokens": 1278894427.0, + "step": 2669 + }, + { + "epoch": 1.5845697329376853, + "grad_norm": 0.5542775988578796, + "learning_rate": 1e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7549811601638794, + "num_tokens": 1279376388.0, + "step": 2670 + }, + { + "epoch": 1.5851632047477744, + "grad_norm": 0.5391263961791992, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7636376619338989, + "num_tokens": 1279879801.0, + "step": 2671 + }, + { + "epoch": 1.5857566765578635, + "grad_norm": 0.5610396862030029, + "learning_rate": 1e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.760425329208374, + "num_tokens": 1280355513.0, + "step": 2672 + }, + { + "epoch": 1.5863501483679525, + "grad_norm": 0.545296847820282, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7718721628189087, + "num_tokens": 1280821775.0, + "step": 2673 + }, + { + "epoch": 1.5869436201780416, + "grad_norm": 0.5854547023773193, + "learning_rate": 1e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7790708541870117, + "num_tokens": 1281293334.0, + "step": 2674 + }, + { + "epoch": 1.5875370919881306, + "grad_norm": 0.5677891969680786, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.761295735836029, + "num_tokens": 1281770419.0, + "step": 2675 + }, + { + "epoch": 1.5881305637982197, + "grad_norm": 0.5442721247673035, + "learning_rate": 1e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.780120849609375, + "num_tokens": 1282249654.0, + "step": 2676 + }, + { + "epoch": 1.5887240356083088, + "grad_norm": 0.5962350964546204, + "learning_rate": 1e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.765365481376648, + "num_tokens": 1282713362.0, + "step": 2677 + }, + { + "epoch": 
1.5893175074183976, + "grad_norm": 0.5720334649085999, + "learning_rate": 1e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7617271542549133, + "num_tokens": 1283166537.0, + "step": 2678 + }, + { + "epoch": 1.5899109792284867, + "grad_norm": 0.5633089542388916, + "learning_rate": 1e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.760973334312439, + "num_tokens": 1283663999.0, + "step": 2679 + }, + { + "epoch": 1.5905044510385755, + "grad_norm": 0.6175435185432434, + "learning_rate": 1e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7614173889160156, + "num_tokens": 1284075474.0, + "step": 2680 + }, + { + "epoch": 1.5910979228486646, + "grad_norm": 0.5826773047447205, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.7554216384887695, + "num_tokens": 1284554618.0, + "step": 2681 + }, + { + "epoch": 1.5916913946587536, + "grad_norm": 0.5563210844993591, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7638382315635681, + "num_tokens": 1285020060.0, + "step": 2682 + }, + { + "epoch": 1.5922848664688427, + "grad_norm": 0.5070822834968567, + "learning_rate": 1e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7809501886367798, + "num_tokens": 1285549055.0, + "step": 2683 + }, + { + "epoch": 1.5928783382789318, + "grad_norm": 0.5619015693664551, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7722613215446472, + "num_tokens": 1286016118.0, + "step": 2684 + }, + { + "epoch": 1.5934718100890208, + "grad_norm": 0.5616094470024109, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.760342001914978, + "num_tokens": 1286481528.0, + "step": 2685 + }, + { + "epoch": 1.59406528189911, + "grad_norm": 0.5449289679527283, + "learning_rate": 1e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7540212869644165, + "num_tokens": 1286953006.0, + "step": 2686 + }, + { + "epoch": 1.594658753709199, + "grad_norm": 0.5007840991020203, + "learning_rate": 1e-06, + "loss": 0.6948, + "mean_token_accuracy": 
0.7759847044944763, + "num_tokens": 1287475688.0, + "step": 2687 + }, + { + "epoch": 1.5952522255192878, + "grad_norm": 0.5364909768104553, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7725405693054199, + "num_tokens": 1287941789.0, + "step": 2688 + }, + { + "epoch": 1.5958456973293769, + "grad_norm": 0.5740484595298767, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7802730798721313, + "num_tokens": 1288393845.0, + "step": 2689 + }, + { + "epoch": 1.596439169139466, + "grad_norm": 0.5422813296318054, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7681785225868225, + "num_tokens": 1288874308.0, + "step": 2690 + }, + { + "epoch": 1.5970326409495548, + "grad_norm": 0.5414063334465027, + "learning_rate": 1e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7595546841621399, + "num_tokens": 1289369866.0, + "step": 2691 + }, + { + "epoch": 1.5976261127596438, + "grad_norm": 0.5498613715171814, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7653045654296875, + "num_tokens": 1289860039.0, + "step": 2692 + }, + { + "epoch": 1.598219584569733, + "grad_norm": 0.5337958931922913, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7642359733581543, + "num_tokens": 1290348273.0, + "step": 2693 + }, + { + "epoch": 1.598813056379822, + "grad_norm": 0.543213427066803, + "learning_rate": 1e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7790953516960144, + "num_tokens": 1290838023.0, + "step": 2694 + }, + { + "epoch": 1.599406528189911, + "grad_norm": 0.5492595434188843, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7588484287261963, + "num_tokens": 1291323390.0, + "step": 2695 + }, + { + "epoch": 1.6, + "grad_norm": 0.557252049446106, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7698286175727844, + "num_tokens": 1291774885.0, + "step": 2696 + }, + { + "epoch": 1.6005934718100892, + "grad_norm": 0.5381953716278076, 
+ "learning_rate": 1e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.771953284740448, + "num_tokens": 1292259945.0, + "step": 2697 + }, + { + "epoch": 1.601186943620178, + "grad_norm": 0.564189612865448, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7705163955688477, + "num_tokens": 1292768272.0, + "step": 2698 + }, + { + "epoch": 1.601780415430267, + "grad_norm": 0.5371629595756531, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7671794891357422, + "num_tokens": 1293301508.0, + "step": 2699 + }, + { + "epoch": 1.6023738872403561, + "grad_norm": 0.5364012718200684, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.758211076259613, + "num_tokens": 1293813813.0, + "step": 2700 + }, + { + "epoch": 1.602967359050445, + "grad_norm": 0.6189882755279541, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7695114612579346, + "num_tokens": 1294254325.0, + "step": 2701 + }, + { + "epoch": 1.603560830860534, + "grad_norm": 0.5851093530654907, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7697712182998657, + "num_tokens": 1294730262.0, + "step": 2702 + }, + { + "epoch": 1.604154302670623, + "grad_norm": 0.5325128436088562, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7547569870948792, + "num_tokens": 1295210214.0, + "step": 2703 + }, + { + "epoch": 1.6047477744807122, + "grad_norm": 0.5252569317817688, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7677696943283081, + "num_tokens": 1295683021.0, + "step": 2704 + }, + { + "epoch": 1.6053412462908012, + "grad_norm": 0.5909011960029602, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7639906406402588, + "num_tokens": 1296172270.0, + "step": 2705 + }, + { + "epoch": 1.6059347181008903, + "grad_norm": 0.5404974818229675, + "learning_rate": 1e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7687810063362122, + "num_tokens": 1296692777.0, + 
"step": 2706 + }, + { + "epoch": 1.6065281899109793, + "grad_norm": 0.5088779330253601, + "learning_rate": 1e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7781018614768982, + "num_tokens": 1297178158.0, + "step": 2707 + }, + { + "epoch": 1.6071216617210684, + "grad_norm": 0.5416465997695923, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7558668851852417, + "num_tokens": 1297639974.0, + "step": 2708 + }, + { + "epoch": 1.6077151335311572, + "grad_norm": 0.5632715821266174, + "learning_rate": 1e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.767474889755249, + "num_tokens": 1298114776.0, + "step": 2709 + }, + { + "epoch": 1.6083086053412463, + "grad_norm": 0.5471814274787903, + "learning_rate": 1e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7751476168632507, + "num_tokens": 1298584008.0, + "step": 2710 + }, + { + "epoch": 1.6089020771513352, + "grad_norm": 0.5523050427436829, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7560388445854187, + "num_tokens": 1299040960.0, + "step": 2711 + }, + { + "epoch": 1.6094955489614242, + "grad_norm": 0.5373200178146362, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7640810608863831, + "num_tokens": 1299542309.0, + "step": 2712 + }, + { + "epoch": 1.6100890207715133, + "grad_norm": 0.5332396030426025, + "learning_rate": 1e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7740355134010315, + "num_tokens": 1300054213.0, + "step": 2713 + }, + { + "epoch": 1.6106824925816023, + "grad_norm": 0.526846706867218, + "learning_rate": 1e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7819931507110596, + "num_tokens": 1300542391.0, + "step": 2714 + }, + { + "epoch": 1.6112759643916914, + "grad_norm": 0.525079607963562, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7759867906570435, + "num_tokens": 1301027257.0, + "step": 2715 + }, + { + "epoch": 1.6118694362017805, + "grad_norm": 0.5433669686317444, + "learning_rate": 1e-06, + 
"loss": 0.7143, + "mean_token_accuracy": 0.7724345326423645, + "num_tokens": 1301470697.0, + "step": 2716 + }, + { + "epoch": 1.6124629080118695, + "grad_norm": 0.5444355607032776, + "learning_rate": 1e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.765051007270813, + "num_tokens": 1301920083.0, + "step": 2717 + }, + { + "epoch": 1.6130563798219586, + "grad_norm": 0.5486449599266052, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7564136981964111, + "num_tokens": 1302379285.0, + "step": 2718 + }, + { + "epoch": 1.6136498516320474, + "grad_norm": 0.5236811637878418, + "learning_rate": 1e-06, + "loss": 0.742, + "mean_token_accuracy": 0.764545202255249, + "num_tokens": 1302886422.0, + "step": 2719 + }, + { + "epoch": 1.6142433234421365, + "grad_norm": 0.5595813989639282, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7691961526870728, + "num_tokens": 1303333626.0, + "step": 2720 + }, + { + "epoch": 1.6148367952522256, + "grad_norm": 0.5328021049499512, + "learning_rate": 1e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7849034070968628, + "num_tokens": 1303808957.0, + "step": 2721 + }, + { + "epoch": 1.6154302670623144, + "grad_norm": 0.546722948551178, + "learning_rate": 1e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7876155972480774, + "num_tokens": 1304278335.0, + "step": 2722 + }, + { + "epoch": 1.6160237388724035, + "grad_norm": 0.5498180985450745, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7577292919158936, + "num_tokens": 1304747946.0, + "step": 2723 + }, + { + "epoch": 1.6166172106824925, + "grad_norm": 0.5299561023712158, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7596477270126343, + "num_tokens": 1305227256.0, + "step": 2724 + }, + { + "epoch": 1.6172106824925816, + "grad_norm": 0.5599551796913147, + "learning_rate": 1e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7658953666687012, + "num_tokens": 1305647507.0, + "step": 2725 + }, + { + 
"epoch": 1.6178041543026707, + "grad_norm": 0.5443812608718872, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7675517797470093, + "num_tokens": 1306125353.0, + "step": 2726 + }, + { + "epoch": 1.6183976261127597, + "grad_norm": 0.5695729851722717, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7517372965812683, + "num_tokens": 1306579740.0, + "step": 2727 + }, + { + "epoch": 1.6189910979228488, + "grad_norm": 0.5702965259552002, + "learning_rate": 1e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7729812264442444, + "num_tokens": 1307029233.0, + "step": 2728 + }, + { + "epoch": 1.6195845697329379, + "grad_norm": 0.5329123735427856, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7742869853973389, + "num_tokens": 1307516140.0, + "step": 2729 + }, + { + "epoch": 1.6201780415430267, + "grad_norm": 0.5403834581375122, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7677712440490723, + "num_tokens": 1308007286.0, + "step": 2730 + }, + { + "epoch": 1.6207715133531158, + "grad_norm": 0.5500433444976807, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7647705078125, + "num_tokens": 1308477316.0, + "step": 2731 + }, + { + "epoch": 1.6213649851632046, + "grad_norm": 0.5430676341056824, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7702621221542358, + "num_tokens": 1309013925.0, + "step": 2732 + }, + { + "epoch": 1.6219584569732937, + "grad_norm": 0.5473726987838745, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.762249231338501, + "num_tokens": 1309498132.0, + "step": 2733 + }, + { + "epoch": 1.6225519287833827, + "grad_norm": 0.5610039830207825, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7624244689941406, + "num_tokens": 1310020530.0, + "step": 2734 + }, + { + "epoch": 1.6231454005934718, + "grad_norm": 0.6111377477645874, + "learning_rate": 1e-06, + "loss": 0.721, + 
"mean_token_accuracy": 0.7727358341217041, + "num_tokens": 1310451109.0, + "step": 2735 + }, + { + "epoch": 1.6237388724035609, + "grad_norm": 0.5505991578102112, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7801780700683594, + "num_tokens": 1310910521.0, + "step": 2736 + }, + { + "epoch": 1.62433234421365, + "grad_norm": 0.5236226916313171, + "learning_rate": 1e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7812397480010986, + "num_tokens": 1311386627.0, + "step": 2737 + }, + { + "epoch": 1.624925816023739, + "grad_norm": 0.5656935572624207, + "learning_rate": 1e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7760984897613525, + "num_tokens": 1311838158.0, + "step": 2738 + }, + { + "epoch": 1.625519287833828, + "grad_norm": 0.5683273673057556, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7660858631134033, + "num_tokens": 1312286102.0, + "step": 2739 + }, + { + "epoch": 1.6261127596439169, + "grad_norm": 0.5662756562232971, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7452989220619202, + "num_tokens": 1312759460.0, + "step": 2740 + }, + { + "epoch": 1.626706231454006, + "grad_norm": 0.5187019109725952, + "learning_rate": 1e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7803640961647034, + "num_tokens": 1313225558.0, + "step": 2741 + }, + { + "epoch": 1.627299703264095, + "grad_norm": 0.5420846939086914, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7717185616493225, + "num_tokens": 1313701105.0, + "step": 2742 + }, + { + "epoch": 1.6278931750741839, + "grad_norm": 0.5254269242286682, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.751184344291687, + "num_tokens": 1314226928.0, + "step": 2743 + }, + { + "epoch": 1.628486646884273, + "grad_norm": 0.5101087689399719, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7710524201393127, + "num_tokens": 1314764211.0, + "step": 2744 + }, + { + "epoch": 1.629080118694362, 
+ "grad_norm": 0.5429813265800476, + "learning_rate": 1e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7628414630889893, + "num_tokens": 1315285432.0, + "step": 2745 + }, + { + "epoch": 1.629673590504451, + "grad_norm": 0.5660340785980225, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7696809768676758, + "num_tokens": 1315742390.0, + "step": 2746 + }, + { + "epoch": 1.63026706231454, + "grad_norm": 0.5882132649421692, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.772924542427063, + "num_tokens": 1316189338.0, + "step": 2747 + }, + { + "epoch": 1.6308605341246292, + "grad_norm": 0.5820072889328003, + "learning_rate": 1e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7694939970970154, + "num_tokens": 1316639030.0, + "step": 2748 + }, + { + "epoch": 1.6314540059347182, + "grad_norm": 0.5761777758598328, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7729676961898804, + "num_tokens": 1317115829.0, + "step": 2749 + }, + { + "epoch": 1.632047477744807, + "grad_norm": 0.552448570728302, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7632004022598267, + "num_tokens": 1317567480.0, + "step": 2750 + }, + { + "epoch": 1.6326409495548961, + "grad_norm": 0.5222612023353577, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.776930034160614, + "num_tokens": 1318064984.0, + "step": 2751 + }, + { + "epoch": 1.6332344213649852, + "grad_norm": 0.5414539575576782, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7676683664321899, + "num_tokens": 1318557114.0, + "step": 2752 + }, + { + "epoch": 1.633827893175074, + "grad_norm": 0.5582892298698425, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7589554786682129, + "num_tokens": 1319033078.0, + "step": 2753 + }, + { + "epoch": 1.634421364985163, + "grad_norm": 0.54595547914505, + "learning_rate": 1e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7673496007919312, + 
"num_tokens": 1319500767.0, + "step": 2754 + }, + { + "epoch": 1.6350148367952522, + "grad_norm": 0.5718028545379639, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.772618293762207, + "num_tokens": 1319951143.0, + "step": 2755 + }, + { + "epoch": 1.6356083086053412, + "grad_norm": 0.5056583285331726, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7699005007743835, + "num_tokens": 1320475864.0, + "step": 2756 + }, + { + "epoch": 1.6362017804154303, + "grad_norm": 0.5335974097251892, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7542107105255127, + "num_tokens": 1320963063.0, + "step": 2757 + }, + { + "epoch": 1.6367952522255194, + "grad_norm": 0.5655766725540161, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7658107280731201, + "num_tokens": 1321394792.0, + "step": 2758 + }, + { + "epoch": 1.6373887240356084, + "grad_norm": 0.5371735095977783, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.771037757396698, + "num_tokens": 1321904297.0, + "step": 2759 + }, + { + "epoch": 1.6379821958456975, + "grad_norm": 0.5407198071479797, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7595042586326599, + "num_tokens": 1322387947.0, + "step": 2760 + }, + { + "epoch": 1.6385756676557863, + "grad_norm": 0.5760083198547363, + "learning_rate": 1e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7470688819885254, + "num_tokens": 1322870864.0, + "step": 2761 + }, + { + "epoch": 1.6391691394658754, + "grad_norm": 0.5374981164932251, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7675942182540894, + "num_tokens": 1323366410.0, + "step": 2762 + }, + { + "epoch": 1.6397626112759642, + "grad_norm": 0.5234959125518799, + "learning_rate": 1e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7632771134376526, + "num_tokens": 1323862547.0, + "step": 2763 + }, + { + "epoch": 1.6403560830860533, + "grad_norm": 0.5216617584228516, + 
"learning_rate": 1e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7650893926620483, + "num_tokens": 1324353000.0, + "step": 2764 + }, + { + "epoch": 1.6409495548961424, + "grad_norm": 0.5497813820838928, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7585567235946655, + "num_tokens": 1324818575.0, + "step": 2765 + }, + { + "epoch": 1.6415430267062314, + "grad_norm": 0.5219447612762451, + "learning_rate": 1e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7750723958015442, + "num_tokens": 1325356685.0, + "step": 2766 + }, + { + "epoch": 1.6421364985163205, + "grad_norm": 0.5514470338821411, + "learning_rate": 1e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7479089498519897, + "num_tokens": 1325824615.0, + "step": 2767 + }, + { + "epoch": 1.6427299703264095, + "grad_norm": 0.530724048614502, + "learning_rate": 1e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7759705185890198, + "num_tokens": 1326290300.0, + "step": 2768 + }, + { + "epoch": 1.6433234421364986, + "grad_norm": 0.5165941119194031, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.771021842956543, + "num_tokens": 1326789704.0, + "step": 2769 + }, + { + "epoch": 1.6439169139465877, + "grad_norm": 0.5529154539108276, + "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7583346366882324, + "num_tokens": 1327318671.0, + "step": 2770 + }, + { + "epoch": 1.6445103857566765, + "grad_norm": 0.5817209482192993, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7736080884933472, + "num_tokens": 1327791833.0, + "step": 2771 + }, + { + "epoch": 1.6451038575667656, + "grad_norm": 0.5503882765769958, + "learning_rate": 1e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7664919495582581, + "num_tokens": 1328269290.0, + "step": 2772 + }, + { + "epoch": 1.6456973293768546, + "grad_norm": 0.5722193121910095, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.775020956993103, + "num_tokens": 1328732809.0, + 
"step": 2773 + }, + { + "epoch": 1.6462908011869435, + "grad_norm": 0.5740134119987488, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7564058303833008, + "num_tokens": 1329171910.0, + "step": 2774 + }, + { + "epoch": 1.6468842729970326, + "grad_norm": 0.5700508952140808, + "learning_rate": 1e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7615492343902588, + "num_tokens": 1329625101.0, + "step": 2775 + }, + { + "epoch": 1.6474777448071216, + "grad_norm": 0.5137071013450623, + "learning_rate": 1e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7773475050926208, + "num_tokens": 1330152222.0, + "step": 2776 + }, + { + "epoch": 1.6480712166172107, + "grad_norm": 0.5350685119628906, + "learning_rate": 1e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7735699415206909, + "num_tokens": 1330648024.0, + "step": 2777 + }, + { + "epoch": 1.6486646884272997, + "grad_norm": 0.511142373085022, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7621638774871826, + "num_tokens": 1331172594.0, + "step": 2778 + }, + { + "epoch": 1.6492581602373888, + "grad_norm": 0.5569871068000793, + "learning_rate": 1e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7725971341133118, + "num_tokens": 1331634207.0, + "step": 2779 + }, + { + "epoch": 1.6498516320474779, + "grad_norm": 0.5581631064414978, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7683435678482056, + "num_tokens": 1332109721.0, + "step": 2780 + }, + { + "epoch": 1.650445103857567, + "grad_norm": 0.5582797527313232, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7701215147972107, + "num_tokens": 1332618129.0, + "step": 2781 + }, + { + "epoch": 1.6510385756676558, + "grad_norm": 0.5778260827064514, + "learning_rate": 1e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7713531255722046, + "num_tokens": 1333076506.0, + "step": 2782 + }, + { + "epoch": 1.6516320474777448, + "grad_norm": 0.5667759776115417, + "learning_rate": 1e-06, + 
"loss": 0.7094, + "mean_token_accuracy": 0.7753002643585205, + "num_tokens": 1333573400.0, + "step": 2783 + }, + { + "epoch": 1.6522255192878337, + "grad_norm": 0.509402334690094, + "learning_rate": 1e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7719569802284241, + "num_tokens": 1334088872.0, + "step": 2784 + }, + { + "epoch": 1.6528189910979227, + "grad_norm": 0.5571190714836121, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7641499042510986, + "num_tokens": 1334535446.0, + "step": 2785 + }, + { + "epoch": 1.6534124629080118, + "grad_norm": 0.5401875972747803, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.749854326248169, + "num_tokens": 1335042629.0, + "step": 2786 + }, + { + "epoch": 1.6540059347181009, + "grad_norm": 0.5765283107757568, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.755545437335968, + "num_tokens": 1335496748.0, + "step": 2787 + }, + { + "epoch": 1.65459940652819, + "grad_norm": 0.5466844439506531, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7669561505317688, + "num_tokens": 1335970149.0, + "step": 2788 + }, + { + "epoch": 1.655192878338279, + "grad_norm": 0.5879860520362854, + "learning_rate": 1e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.764936089515686, + "num_tokens": 1336412705.0, + "step": 2789 + }, + { + "epoch": 1.655786350148368, + "grad_norm": 0.5465027093887329, + "learning_rate": 1e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7713994383811951, + "num_tokens": 1336879302.0, + "step": 2790 + }, + { + "epoch": 1.6563798219584571, + "grad_norm": 0.538235604763031, + "learning_rate": 1e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7824434638023376, + "num_tokens": 1337389266.0, + "step": 2791 + }, + { + "epoch": 1.656973293768546, + "grad_norm": 0.5233417749404907, + "learning_rate": 1e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7737836241722107, + "num_tokens": 1337874031.0, + "step": 2792 + }, + { + "epoch": 
1.657566765578635, + "grad_norm": 0.5591315031051636, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7621907591819763, + "num_tokens": 1338355773.0, + "step": 2793 + }, + { + "epoch": 1.658160237388724, + "grad_norm": 0.5611224174499512, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7711851596832275, + "num_tokens": 1338819775.0, + "step": 2794 + }, + { + "epoch": 1.658753709198813, + "grad_norm": 0.5247663855552673, + "learning_rate": 1e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7731274962425232, + "num_tokens": 1339312499.0, + "step": 2795 + }, + { + "epoch": 1.659347181008902, + "grad_norm": 0.555913507938385, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7560749053955078, + "num_tokens": 1339770638.0, + "step": 2796 + }, + { + "epoch": 1.659940652818991, + "grad_norm": 0.5116640329360962, + "learning_rate": 1e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7497419118881226, + "num_tokens": 1340282991.0, + "step": 2797 + }, + { + "epoch": 1.6605341246290801, + "grad_norm": 0.5489585995674133, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7728151082992554, + "num_tokens": 1340748595.0, + "step": 2798 + }, + { + "epoch": 1.6611275964391692, + "grad_norm": 0.5329060554504395, + "learning_rate": 1e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7745463848114014, + "num_tokens": 1341207227.0, + "step": 2799 + }, + { + "epoch": 1.6617210682492582, + "grad_norm": 0.5359109044075012, + "learning_rate": 1e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.7830241918563843, + "num_tokens": 1341713077.0, + "step": 2800 + }, + { + "epoch": 1.6623145400593473, + "grad_norm": 0.5581989288330078, + "learning_rate": 1e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7607522010803223, + "num_tokens": 1342148597.0, + "step": 2801 + }, + { + "epoch": 1.6629080118694362, + "grad_norm": 0.524518609046936, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 
0.7690269947052002, + "num_tokens": 1342619846.0, + "step": 2802 + }, + { + "epoch": 1.6635014836795252, + "grad_norm": 0.5582440495491028, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.754558801651001, + "num_tokens": 1343092063.0, + "step": 2803 + }, + { + "epoch": 1.6640949554896143, + "grad_norm": 0.533340573310852, + "learning_rate": 1e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7577023506164551, + "num_tokens": 1343596926.0, + "step": 2804 + }, + { + "epoch": 1.6646884272997031, + "grad_norm": 0.5574774146080017, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.764876127243042, + "num_tokens": 1344049723.0, + "step": 2805 + }, + { + "epoch": 1.6652818991097922, + "grad_norm": 0.5394800305366516, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7615729570388794, + "num_tokens": 1344518997.0, + "step": 2806 + }, + { + "epoch": 1.6658753709198812, + "grad_norm": 0.574664294719696, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7722213268280029, + "num_tokens": 1345001293.0, + "step": 2807 + }, + { + "epoch": 1.6664688427299703, + "grad_norm": 0.5704523324966431, + "learning_rate": 1e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7566363215446472, + "num_tokens": 1345481373.0, + "step": 2808 + }, + { + "epoch": 1.6670623145400594, + "grad_norm": 0.5481070876121521, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.761941134929657, + "num_tokens": 1346019321.0, + "step": 2809 + }, + { + "epoch": 1.6676557863501484, + "grad_norm": 0.5354712605476379, + "learning_rate": 1e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7651752829551697, + "num_tokens": 1346533561.0, + "step": 2810 + }, + { + "epoch": 1.6682492581602375, + "grad_norm": 0.49970921874046326, + "learning_rate": 1e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7725441455841064, + "num_tokens": 1347054525.0, + "step": 2811 + }, + { + "epoch": 1.6688427299703266, + "grad_norm": 
0.5347199440002441, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7682981491088867, + "num_tokens": 1347576851.0, + "step": 2812 + }, + { + "epoch": 1.6694362017804154, + "grad_norm": 0.5759356021881104, + "learning_rate": 1e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.765671968460083, + "num_tokens": 1348056865.0, + "step": 2813 + }, + { + "epoch": 1.6700296735905045, + "grad_norm": 0.5646697282791138, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7690160274505615, + "num_tokens": 1348516311.0, + "step": 2814 + }, + { + "epoch": 1.6706231454005933, + "grad_norm": 0.5642701387405396, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7652549743652344, + "num_tokens": 1348982349.0, + "step": 2815 + }, + { + "epoch": 1.6712166172106824, + "grad_norm": 0.5290924906730652, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7572306394577026, + "num_tokens": 1349493581.0, + "step": 2816 + }, + { + "epoch": 1.6718100890207714, + "grad_norm": 0.5565234422683716, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7736340165138245, + "num_tokens": 1349949137.0, + "step": 2817 + }, + { + "epoch": 1.6724035608308605, + "grad_norm": 0.5707710981369019, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7537784576416016, + "num_tokens": 1350411789.0, + "step": 2818 + }, + { + "epoch": 1.6729970326409496, + "grad_norm": 0.5541338920593262, + "learning_rate": 1e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7697779536247253, + "num_tokens": 1350887100.0, + "step": 2819 + }, + { + "epoch": 1.6735905044510386, + "grad_norm": 0.5377499461174011, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7662673592567444, + "num_tokens": 1351386422.0, + "step": 2820 + }, + { + "epoch": 1.6741839762611277, + "grad_norm": 0.5313231945037842, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7743548154830933, + 
"num_tokens": 1351845754.0, + "step": 2821 + }, + { + "epoch": 1.6747774480712168, + "grad_norm": 0.5448083281517029, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7515393495559692, + "num_tokens": 1352327783.0, + "step": 2822 + }, + { + "epoch": 1.6753709198813056, + "grad_norm": 0.5544896721839905, + "learning_rate": 1e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7580995559692383, + "num_tokens": 1352760952.0, + "step": 2823 + }, + { + "epoch": 1.6759643916913947, + "grad_norm": 0.5498589277267456, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7635077834129333, + "num_tokens": 1353221890.0, + "step": 2824 + }, + { + "epoch": 1.6765578635014837, + "grad_norm": 0.5465884804725647, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7643330097198486, + "num_tokens": 1353707373.0, + "step": 2825 + }, + { + "epoch": 1.6771513353115726, + "grad_norm": 0.5104246735572815, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7600594758987427, + "num_tokens": 1354239065.0, + "step": 2826 + }, + { + "epoch": 1.6777448071216616, + "grad_norm": 0.5176457166671753, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7623393535614014, + "num_tokens": 1354721641.0, + "step": 2827 + }, + { + "epoch": 1.6783382789317507, + "grad_norm": 0.5009331107139587, + "learning_rate": 1e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.774075448513031, + "num_tokens": 1355241627.0, + "step": 2828 + }, + { + "epoch": 1.6789317507418398, + "grad_norm": 0.5542586445808411, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7574248909950256, + "num_tokens": 1355736877.0, + "step": 2829 + }, + { + "epoch": 1.6795252225519288, + "grad_norm": 0.5433722734451294, + "learning_rate": 1e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7659082412719727, + "num_tokens": 1356220584.0, + "step": 2830 + }, + { + "epoch": 1.6801186943620179, + "grad_norm": 0.538232147693634, + 
"learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7647392749786377, + "num_tokens": 1356725267.0, + "step": 2831 + }, + { + "epoch": 1.680712166172107, + "grad_norm": 0.5731938481330872, + "learning_rate": 1e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7599485516548157, + "num_tokens": 1357164199.0, + "step": 2832 + }, + { + "epoch": 1.681305637982196, + "grad_norm": 0.5463622808456421, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7697113752365112, + "num_tokens": 1357627071.0, + "step": 2833 + }, + { + "epoch": 1.6818991097922849, + "grad_norm": 0.5409271121025085, + "learning_rate": 1e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7598168849945068, + "num_tokens": 1358095440.0, + "step": 2834 + }, + { + "epoch": 1.682492581602374, + "grad_norm": 0.548525869846344, + "learning_rate": 1e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.7540870308876038, + "num_tokens": 1358566701.0, + "step": 2835 + }, + { + "epoch": 1.6830860534124628, + "grad_norm": 0.5650437474250793, + "learning_rate": 1e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7680264711380005, + "num_tokens": 1359068359.0, + "step": 2836 + }, + { + "epoch": 1.6836795252225518, + "grad_norm": 0.5434325337409973, + "learning_rate": 1e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7712035179138184, + "num_tokens": 1359543214.0, + "step": 2837 + }, + { + "epoch": 1.6842729970326409, + "grad_norm": 0.5349525213241577, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7737130522727966, + "num_tokens": 1360063611.0, + "step": 2838 + }, + { + "epoch": 1.68486646884273, + "grad_norm": 0.5455388426780701, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7591256499290466, + "num_tokens": 1360559806.0, + "step": 2839 + }, + { + "epoch": 1.685459940652819, + "grad_norm": 0.5356411933898926, + "learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7656432390213013, + "num_tokens": 1361077886.0, + 
"step": 2840 + }, + { + "epoch": 1.686053412462908, + "grad_norm": 0.5645689964294434, + "learning_rate": 1e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7632119655609131, + "num_tokens": 1361534114.0, + "step": 2841 + }, + { + "epoch": 1.6866468842729971, + "grad_norm": 0.5624204277992249, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7660292387008667, + "num_tokens": 1361977843.0, + "step": 2842 + }, + { + "epoch": 1.6872403560830862, + "grad_norm": 0.5439847707748413, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7655264735221863, + "num_tokens": 1362447047.0, + "step": 2843 + }, + { + "epoch": 1.687833827893175, + "grad_norm": 0.5508766770362854, + "learning_rate": 1e-06, + "loss": 0.762, + "mean_token_accuracy": 0.757870078086853, + "num_tokens": 1362923960.0, + "step": 2844 + }, + { + "epoch": 1.688427299703264, + "grad_norm": 0.5586474537849426, + "learning_rate": 1e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7685339450836182, + "num_tokens": 1363388459.0, + "step": 2845 + }, + { + "epoch": 1.6890207715133532, + "grad_norm": 0.5364988446235657, + "learning_rate": 1e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7638350129127502, + "num_tokens": 1363879396.0, + "step": 2846 + }, + { + "epoch": 1.689614243323442, + "grad_norm": 0.5392016768455505, + "learning_rate": 1e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7510835528373718, + "num_tokens": 1364369905.0, + "step": 2847 + }, + { + "epoch": 1.690207715133531, + "grad_norm": 0.5269027352333069, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7658060789108276, + "num_tokens": 1364814309.0, + "step": 2848 + }, + { + "epoch": 1.6908011869436201, + "grad_norm": 0.5569730401039124, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7719899415969849, + "num_tokens": 1365306846.0, + "step": 2849 + }, + { + "epoch": 1.6913946587537092, + "grad_norm": 0.5367538928985596, + "learning_rate": 1e-06, + "loss": 
0.7383, + "mean_token_accuracy": 0.7676200866699219, + "num_tokens": 1365831213.0, + "step": 2850 + }, + { + "epoch": 1.6919881305637983, + "grad_norm": 0.5575068593025208, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.767488956451416, + "num_tokens": 1366322948.0, + "step": 2851 + }, + { + "epoch": 1.6925816023738873, + "grad_norm": 0.6201173067092896, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7465351819992065, + "num_tokens": 1366727097.0, + "step": 2852 + }, + { + "epoch": 1.6931750741839764, + "grad_norm": 0.5393926501274109, + "learning_rate": 1e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7545297145843506, + "num_tokens": 1367253544.0, + "step": 2853 + }, + { + "epoch": 1.6937685459940652, + "grad_norm": 0.5426279902458191, + "learning_rate": 1e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.776954174041748, + "num_tokens": 1367741070.0, + "step": 2854 + }, + { + "epoch": 1.6943620178041543, + "grad_norm": 0.532373309135437, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7737574577331543, + "num_tokens": 1368235729.0, + "step": 2855 + }, + { + "epoch": 1.6949554896142434, + "grad_norm": 0.5253651142120361, + "learning_rate": 1e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7557851672172546, + "num_tokens": 1368688339.0, + "step": 2856 + }, + { + "epoch": 1.6955489614243322, + "grad_norm": 0.5100418925285339, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.765056848526001, + "num_tokens": 1369207813.0, + "step": 2857 + }, + { + "epoch": 1.6961424332344213, + "grad_norm": 0.5438231229782104, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7551892399787903, + "num_tokens": 1369713715.0, + "step": 2858 + }, + { + "epoch": 1.6967359050445103, + "grad_norm": 0.5373634099960327, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7687567472457886, + "num_tokens": 1370187079.0, + "step": 2859 + }, + { + "epoch": 
1.6973293768545994, + "grad_norm": 0.5101094245910645, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7695555686950684, + "num_tokens": 1370704444.0, + "step": 2860 + }, + { + "epoch": 1.6979228486646885, + "grad_norm": 0.5519542098045349, + "learning_rate": 1e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7738268375396729, + "num_tokens": 1371192168.0, + "step": 2861 + }, + { + "epoch": 1.6985163204747775, + "grad_norm": 0.5591471791267395, + "learning_rate": 1e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7598094940185547, + "num_tokens": 1371660917.0, + "step": 2862 + }, + { + "epoch": 1.6991097922848666, + "grad_norm": 0.5401105880737305, + "learning_rate": 1e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7719568610191345, + "num_tokens": 1372154773.0, + "step": 2863 + }, + { + "epoch": 1.6997032640949556, + "grad_norm": 0.5472722053527832, + "learning_rate": 1e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.780632734298706, + "num_tokens": 1372617862.0, + "step": 2864 + }, + { + "epoch": 1.7002967359050445, + "grad_norm": 0.5192298889160156, + "learning_rate": 1e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7632215023040771, + "num_tokens": 1373112840.0, + "step": 2865 + }, + { + "epoch": 1.7008902077151336, + "grad_norm": 0.544945478439331, + "learning_rate": 1e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7773805856704712, + "num_tokens": 1373619072.0, + "step": 2866 + }, + { + "epoch": 1.7014836795252224, + "grad_norm": 0.5595630407333374, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7686710357666016, + "num_tokens": 1374077085.0, + "step": 2867 + }, + { + "epoch": 1.7020771513353115, + "grad_norm": 0.5366166830062866, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.74399733543396, + "num_tokens": 1374562300.0, + "step": 2868 + }, + { + "epoch": 1.7026706231454005, + "grad_norm": 0.5738952159881592, + "learning_rate": 1e-06, + "loss": 0.7412, + 
"mean_token_accuracy": 0.7642298936843872, + "num_tokens": 1375005397.0, + "step": 2869 + }, + { + "epoch": 1.7032640949554896, + "grad_norm": 0.554161787033081, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7623088955879211, + "num_tokens": 1375482947.0, + "step": 2870 + }, + { + "epoch": 1.7038575667655786, + "grad_norm": 0.5607578158378601, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7672851085662842, + "num_tokens": 1375944318.0, + "step": 2871 + }, + { + "epoch": 1.7044510385756677, + "grad_norm": 0.5754978656768799, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7672871351242065, + "num_tokens": 1376440010.0, + "step": 2872 + }, + { + "epoch": 1.7050445103857568, + "grad_norm": 0.5258631706237793, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7748433351516724, + "num_tokens": 1376941625.0, + "step": 2873 + }, + { + "epoch": 1.7056379821958458, + "grad_norm": 0.52195143699646, + "learning_rate": 1e-06, + "loss": 0.6807, + "mean_token_accuracy": 0.7821827530860901, + "num_tokens": 1377409336.0, + "step": 2874 + }, + { + "epoch": 1.7062314540059347, + "grad_norm": 0.5390281677246094, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7658142447471619, + "num_tokens": 1377877829.0, + "step": 2875 + }, + { + "epoch": 1.7068249258160237, + "grad_norm": 0.5616140365600586, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7624745965003967, + "num_tokens": 1378355540.0, + "step": 2876 + }, + { + "epoch": 1.7074183976261128, + "grad_norm": 0.5085721611976624, + "learning_rate": 1e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7742891311645508, + "num_tokens": 1378872201.0, + "step": 2877 + }, + { + "epoch": 1.7080118694362016, + "grad_norm": 0.5151790976524353, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7667359113693237, + "num_tokens": 1379376039.0, + "step": 2878 + }, + { + "epoch": 
1.7086053412462907, + "grad_norm": 0.587086021900177, + "learning_rate": 1e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.754328727722168, + "num_tokens": 1379804419.0, + "step": 2879 + }, + { + "epoch": 1.7091988130563798, + "grad_norm": 0.5728938579559326, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7526810169219971, + "num_tokens": 1380249260.0, + "step": 2880 + }, + { + "epoch": 1.7097922848664688, + "grad_norm": 0.5554075837135315, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7701424360275269, + "num_tokens": 1380695260.0, + "step": 2881 + }, + { + "epoch": 1.710385756676558, + "grad_norm": 0.5405480861663818, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7725854516029358, + "num_tokens": 1381174323.0, + "step": 2882 + }, + { + "epoch": 1.710979228486647, + "grad_norm": 0.5110522508621216, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7782223224639893, + "num_tokens": 1381695283.0, + "step": 2883 + }, + { + "epoch": 1.711572700296736, + "grad_norm": 0.5532545447349548, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7677348852157593, + "num_tokens": 1382156386.0, + "step": 2884 + }, + { + "epoch": 1.712166172106825, + "grad_norm": 0.5462065935134888, + "learning_rate": 1e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7680596113204956, + "num_tokens": 1382614607.0, + "step": 2885 + }, + { + "epoch": 1.712759643916914, + "grad_norm": 0.5352627038955688, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7746183276176453, + "num_tokens": 1383123668.0, + "step": 2886 + }, + { + "epoch": 1.713353115727003, + "grad_norm": 0.5575618743896484, + "learning_rate": 1e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.75943922996521, + "num_tokens": 1383587075.0, + "step": 2887 + }, + { + "epoch": 1.7139465875370918, + "grad_norm": 0.5302389860153198, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 
0.7775288224220276, + "num_tokens": 1384079860.0, + "step": 2888 + }, + { + "epoch": 1.714540059347181, + "grad_norm": 0.537580668926239, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7690922617912292, + "num_tokens": 1384558002.0, + "step": 2889 + }, + { + "epoch": 1.71513353115727, + "grad_norm": 0.5276663303375244, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.752144992351532, + "num_tokens": 1385032562.0, + "step": 2890 + }, + { + "epoch": 1.715727002967359, + "grad_norm": 0.534262478351593, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7795418500900269, + "num_tokens": 1385491922.0, + "step": 2891 + }, + { + "epoch": 1.716320474777448, + "grad_norm": 0.5342673659324646, + "learning_rate": 1e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7592108249664307, + "num_tokens": 1385963194.0, + "step": 2892 + }, + { + "epoch": 1.7169139465875372, + "grad_norm": 0.5414173007011414, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7510409951210022, + "num_tokens": 1386477224.0, + "step": 2893 + }, + { + "epoch": 1.7175074183976262, + "grad_norm": 0.5690497756004333, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7437864542007446, + "num_tokens": 1386920449.0, + "step": 2894 + }, + { + "epoch": 1.7181008902077153, + "grad_norm": 0.5101307034492493, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7581582069396973, + "num_tokens": 1387441266.0, + "step": 2895 + }, + { + "epoch": 1.7186943620178041, + "grad_norm": 0.5254700779914856, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7505462765693665, + "num_tokens": 1387950624.0, + "step": 2896 + }, + { + "epoch": 1.7192878338278932, + "grad_norm": 0.595696210861206, + "learning_rate": 1e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.7661734819412231, + "num_tokens": 1388427464.0, + "step": 2897 + }, + { + "epoch": 1.7198813056379822, + "grad_norm": 
0.5551173090934753, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.773753821849823, + "num_tokens": 1388883778.0, + "step": 2898 + }, + { + "epoch": 1.720474777448071, + "grad_norm": 0.5904739499092102, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7654628753662109, + "num_tokens": 1389321216.0, + "step": 2899 + }, + { + "epoch": 1.7210682492581602, + "grad_norm": 0.5398632287979126, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7707008123397827, + "num_tokens": 1389782186.0, + "step": 2900 + }, + { + "epoch": 1.7216617210682492, + "grad_norm": 0.5308007001876831, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7720186710357666, + "num_tokens": 1390289650.0, + "step": 2901 + }, + { + "epoch": 1.7222551928783383, + "grad_norm": 0.552123486995697, + "learning_rate": 1e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7734553813934326, + "num_tokens": 1390799538.0, + "step": 2902 + }, + { + "epoch": 1.7228486646884273, + "grad_norm": 0.6225234270095825, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7726777791976929, + "num_tokens": 1391236122.0, + "step": 2903 + }, + { + "epoch": 1.7234421364985164, + "grad_norm": 0.5496450662612915, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7530158162117004, + "num_tokens": 1391761377.0, + "step": 2904 + }, + { + "epoch": 1.7240356083086055, + "grad_norm": 0.52112877368927, + "learning_rate": 1e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7768512964248657, + "num_tokens": 1392277063.0, + "step": 2905 + }, + { + "epoch": 1.7246290801186943, + "grad_norm": 0.568763256072998, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7680108547210693, + "num_tokens": 1392771934.0, + "step": 2906 + }, + { + "epoch": 1.7252225519287834, + "grad_norm": 0.5554589033126831, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7638636827468872, + "num_tokens": 
1393218347.0, + "step": 2907 + }, + { + "epoch": 1.7258160237388724, + "grad_norm": 0.5601651072502136, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7581660747528076, + "num_tokens": 1393664585.0, + "step": 2908 + }, + { + "epoch": 1.7264094955489613, + "grad_norm": 0.5353600382804871, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7688586115837097, + "num_tokens": 1394194302.0, + "step": 2909 + }, + { + "epoch": 1.7270029673590503, + "grad_norm": 0.5332889556884766, + "learning_rate": 1e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7700957655906677, + "num_tokens": 1394721583.0, + "step": 2910 + }, + { + "epoch": 1.7275964391691394, + "grad_norm": 0.5340728163719177, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7701643705368042, + "num_tokens": 1395210417.0, + "step": 2911 + }, + { + "epoch": 1.7281899109792285, + "grad_norm": 0.5313318967819214, + "learning_rate": 1e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.754988431930542, + "num_tokens": 1395704722.0, + "step": 2912 + }, + { + "epoch": 1.7287833827893175, + "grad_norm": 0.5570839643478394, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7561115026473999, + "num_tokens": 1396138919.0, + "step": 2913 + }, + { + "epoch": 1.7293768545994066, + "grad_norm": 0.5649080872535706, + "learning_rate": 1e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7585226893424988, + "num_tokens": 1396591522.0, + "step": 2914 + }, + { + "epoch": 1.7299703264094957, + "grad_norm": 0.5272946357727051, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7628223896026611, + "num_tokens": 1397074688.0, + "step": 2915 + }, + { + "epoch": 1.7305637982195847, + "grad_norm": 0.5328464508056641, + "learning_rate": 1e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7583417892456055, + "num_tokens": 1397527938.0, + "step": 2916 + }, + { + "epoch": 1.7311572700296736, + "grad_norm": 0.5104858875274658, + "learning_rate": 
1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7624601125717163, + "num_tokens": 1398047077.0, + "step": 2917 + }, + { + "epoch": 1.7317507418397626, + "grad_norm": 0.5902320742607117, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7583352327346802, + "num_tokens": 1398534577.0, + "step": 2918 + }, + { + "epoch": 1.7323442136498515, + "grad_norm": 0.5244247913360596, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7679296731948853, + "num_tokens": 1399057596.0, + "step": 2919 + }, + { + "epoch": 1.7329376854599405, + "grad_norm": 0.5196854472160339, + "learning_rate": 1e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7763161063194275, + "num_tokens": 1399560839.0, + "step": 2920 + }, + { + "epoch": 1.7335311572700296, + "grad_norm": 0.5457808375358582, + "learning_rate": 1e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7509583234786987, + "num_tokens": 1400027512.0, + "step": 2921 + }, + { + "epoch": 1.7341246290801187, + "grad_norm": 0.5719856023788452, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7564160227775574, + "num_tokens": 1400476058.0, + "step": 2922 + }, + { + "epoch": 1.7347181008902077, + "grad_norm": 0.5518815517425537, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.75690096616745, + "num_tokens": 1400959049.0, + "step": 2923 + }, + { + "epoch": 1.7353115727002968, + "grad_norm": 0.5256893634796143, + "learning_rate": 1e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7650537490844727, + "num_tokens": 1401441926.0, + "step": 2924 + }, + { + "epoch": 1.7359050445103859, + "grad_norm": 0.5271478295326233, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7678954601287842, + "num_tokens": 1401932683.0, + "step": 2925 + }, + { + "epoch": 1.736498516320475, + "grad_norm": 0.5376334190368652, + "learning_rate": 1e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7724161148071289, + "num_tokens": 1402414488.0, + "step": 2926 + }, + 
{ + "epoch": 1.7370919881305638, + "grad_norm": 0.5031099915504456, + "learning_rate": 1e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.758754312992096, + "num_tokens": 1402976632.0, + "step": 2927 + }, + { + "epoch": 1.7376854599406528, + "grad_norm": 0.5210289359092712, + "learning_rate": 1e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.769858717918396, + "num_tokens": 1403451101.0, + "step": 2928 + }, + { + "epoch": 1.7382789317507419, + "grad_norm": 0.53221595287323, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7733384370803833, + "num_tokens": 1403938254.0, + "step": 2929 + }, + { + "epoch": 1.7388724035608307, + "grad_norm": 0.5491842031478882, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.757323145866394, + "num_tokens": 1404398404.0, + "step": 2930 + }, + { + "epoch": 1.7394658753709198, + "grad_norm": 0.5764380693435669, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7666786909103394, + "num_tokens": 1404849314.0, + "step": 2931 + }, + { + "epoch": 1.7400593471810089, + "grad_norm": 0.5488528609275818, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7501739859580994, + "num_tokens": 1405321342.0, + "step": 2932 + }, + { + "epoch": 1.740652818991098, + "grad_norm": 0.5367371439933777, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7736606001853943, + "num_tokens": 1405809465.0, + "step": 2933 + }, + { + "epoch": 1.741246290801187, + "grad_norm": 0.554843544960022, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7517662048339844, + "num_tokens": 1406280175.0, + "step": 2934 + }, + { + "epoch": 1.741839762611276, + "grad_norm": 0.5353673696517944, + "learning_rate": 1e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7457011342048645, + "num_tokens": 1406762421.0, + "step": 2935 + }, + { + "epoch": 1.742433234421365, + "grad_norm": 0.5667372941970825, + "learning_rate": 1e-06, + "loss": 0.7114, + 
"mean_token_accuracy": 0.773754358291626, + "num_tokens": 1407195405.0, + "step": 2936 + }, + { + "epoch": 1.7430267062314542, + "grad_norm": 0.565528929233551, + "learning_rate": 1e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7850456237792969, + "num_tokens": 1407653231.0, + "step": 2937 + }, + { + "epoch": 1.743620178041543, + "grad_norm": 0.5160094499588013, + "learning_rate": 1e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7671541571617126, + "num_tokens": 1408163553.0, + "step": 2938 + }, + { + "epoch": 1.744213649851632, + "grad_norm": 0.5463855266571045, + "learning_rate": 1e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7772748470306396, + "num_tokens": 1408615022.0, + "step": 2939 + }, + { + "epoch": 1.744807121661721, + "grad_norm": 0.5708650350570679, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7613498568534851, + "num_tokens": 1409120496.0, + "step": 2940 + }, + { + "epoch": 1.74540059347181, + "grad_norm": 0.5281055569648743, + "learning_rate": 1e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7767893075942993, + "num_tokens": 1409616920.0, + "step": 2941 + }, + { + "epoch": 1.745994065281899, + "grad_norm": 0.5555668473243713, + "learning_rate": 1e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7566113471984863, + "num_tokens": 1410075946.0, + "step": 2942 + }, + { + "epoch": 1.746587537091988, + "grad_norm": 0.536058247089386, + "learning_rate": 1e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7429171800613403, + "num_tokens": 1410535918.0, + "step": 2943 + }, + { + "epoch": 1.7471810089020772, + "grad_norm": 0.5400805473327637, + "learning_rate": 1e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.7675479650497437, + "num_tokens": 1411021964.0, + "step": 2944 + }, + { + "epoch": 1.7477744807121662, + "grad_norm": 0.5216708779335022, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7637038230895996, + "num_tokens": 1411536491.0, + "step": 2945 + }, + { + "epoch": 1.7483679525222553, + 
"grad_norm": 0.5286399722099304, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7660627365112305, + "num_tokens": 1412051329.0, + "step": 2946 + }, + { + "epoch": 1.7489614243323444, + "grad_norm": 0.5594281554222107, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7614277005195618, + "num_tokens": 1412489634.0, + "step": 2947 + }, + { + "epoch": 1.7495548961424332, + "grad_norm": 0.5440286993980408, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7599865198135376, + "num_tokens": 1412953951.0, + "step": 2948 + }, + { + "epoch": 1.7501483679525223, + "grad_norm": 0.5168679356575012, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7684992551803589, + "num_tokens": 1413451154.0, + "step": 2949 + }, + { + "epoch": 1.7507418397626113, + "grad_norm": 0.5596041679382324, + "learning_rate": 1e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.7598533034324646, + "num_tokens": 1413974297.0, + "step": 2950 + }, + { + "epoch": 1.7513353115727002, + "grad_norm": 0.5610448718070984, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7581981420516968, + "num_tokens": 1414436371.0, + "step": 2951 + }, + { + "epoch": 1.7519287833827892, + "grad_norm": 0.5664941668510437, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7571420669555664, + "num_tokens": 1414946661.0, + "step": 2952 + }, + { + "epoch": 1.7525222551928783, + "grad_norm": 0.5371881127357483, + "learning_rate": 1e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7571073174476624, + "num_tokens": 1415404115.0, + "step": 2953 + }, + { + "epoch": 1.7531157270029674, + "grad_norm": 0.545409083366394, + "learning_rate": 1e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7823677062988281, + "num_tokens": 1415851708.0, + "step": 2954 + }, + { + "epoch": 1.7537091988130564, + "grad_norm": 0.5432368516921997, + "learning_rate": 1e-06, + "loss": 0.7085, + "mean_token_accuracy": 
0.7749441862106323, + "num_tokens": 1416311569.0, + "step": 2955 + }, + { + "epoch": 1.7543026706231455, + "grad_norm": 0.547568142414093, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7593834400177002, + "num_tokens": 1416785080.0, + "step": 2956 + }, + { + "epoch": 1.7548961424332346, + "grad_norm": 0.5429648756980896, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7721942067146301, + "num_tokens": 1417270884.0, + "step": 2957 + }, + { + "epoch": 1.7554896142433234, + "grad_norm": 0.5486127734184265, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7639952301979065, + "num_tokens": 1417716931.0, + "step": 2958 + }, + { + "epoch": 1.7560830860534125, + "grad_norm": 0.5343555212020874, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.759128987789154, + "num_tokens": 1418191984.0, + "step": 2959 + }, + { + "epoch": 1.7566765578635015, + "grad_norm": 0.5407062768936157, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7629460692405701, + "num_tokens": 1418676441.0, + "step": 2960 + }, + { + "epoch": 1.7572700296735904, + "grad_norm": 0.5262375473976135, + "learning_rate": 1e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7624605298042297, + "num_tokens": 1419169674.0, + "step": 2961 + }, + { + "epoch": 1.7578635014836794, + "grad_norm": 0.5654997825622559, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7676503658294678, + "num_tokens": 1419599581.0, + "step": 2962 + }, + { + "epoch": 1.7584569732937685, + "grad_norm": 0.551226019859314, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7622687816619873, + "num_tokens": 1420066774.0, + "step": 2963 + }, + { + "epoch": 1.7590504451038576, + "grad_norm": 0.5813509821891785, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7464603185653687, + "num_tokens": 1420528361.0, + "step": 2964 + }, + { + "epoch": 1.7596439169139466, + "grad_norm": 
0.5408374667167664, + "learning_rate": 1e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7751553058624268, + "num_tokens": 1420988393.0, + "step": 2965 + }, + { + "epoch": 1.7602373887240357, + "grad_norm": 0.5547805428504944, + "learning_rate": 1e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.7760186195373535, + "num_tokens": 1421456795.0, + "step": 2966 + }, + { + "epoch": 1.7608308605341247, + "grad_norm": 0.5296668410301208, + "learning_rate": 1e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7782690525054932, + "num_tokens": 1421936061.0, + "step": 2967 + }, + { + "epoch": 1.7614243323442138, + "grad_norm": 0.5299737453460693, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7726345062255859, + "num_tokens": 1422432181.0, + "step": 2968 + }, + { + "epoch": 1.7620178041543026, + "grad_norm": 0.5553017854690552, + "learning_rate": 1e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7590142488479614, + "num_tokens": 1422934003.0, + "step": 2969 + }, + { + "epoch": 1.7626112759643917, + "grad_norm": 0.5583130717277527, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7721575498580933, + "num_tokens": 1423377495.0, + "step": 2970 + }, + { + "epoch": 1.7632047477744806, + "grad_norm": 0.5384240746498108, + "learning_rate": 1e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7753375768661499, + "num_tokens": 1423875081.0, + "step": 2971 + }, + { + "epoch": 1.7637982195845696, + "grad_norm": 0.5333775877952576, + "learning_rate": 1e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7763444781303406, + "num_tokens": 1424413905.0, + "step": 2972 + }, + { + "epoch": 1.7643916913946587, + "grad_norm": 0.5286656022071838, + "learning_rate": 1e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.786172091960907, + "num_tokens": 1424930123.0, + "step": 2973 + }, + { + "epoch": 1.7649851632047477, + "grad_norm": 0.5087640285491943, + "learning_rate": 1e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7717652916908264, + 
"num_tokens": 1425447057.0, + "step": 2974 + }, + { + "epoch": 1.7655786350148368, + "grad_norm": 0.5426702499389648, + "learning_rate": 1e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7661615610122681, + "num_tokens": 1425921151.0, + "step": 2975 + }, + { + "epoch": 1.7661721068249259, + "grad_norm": 0.576615035533905, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7618700265884399, + "num_tokens": 1426385438.0, + "step": 2976 + }, + { + "epoch": 1.766765578635015, + "grad_norm": 0.5405709147453308, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7731631994247437, + "num_tokens": 1426883209.0, + "step": 2977 + }, + { + "epoch": 1.767359050445104, + "grad_norm": 0.5469104051589966, + "learning_rate": 1e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.746831476688385, + "num_tokens": 1427376588.0, + "step": 2978 + }, + { + "epoch": 1.7679525222551928, + "grad_norm": 0.5780754089355469, + "learning_rate": 1e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7810337543487549, + "num_tokens": 1427841377.0, + "step": 2979 + }, + { + "epoch": 1.768545994065282, + "grad_norm": 0.57905113697052, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7670891284942627, + "num_tokens": 1428298852.0, + "step": 2980 + }, + { + "epoch": 1.769139465875371, + "grad_norm": 0.5792118906974792, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7751063108444214, + "num_tokens": 1428800861.0, + "step": 2981 + }, + { + "epoch": 1.7697329376854598, + "grad_norm": 0.5744754672050476, + "learning_rate": 1e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.768256664276123, + "num_tokens": 1429274542.0, + "step": 2982 + }, + { + "epoch": 1.7703264094955489, + "grad_norm": 0.5732820630073547, + "learning_rate": 1e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7618950009346008, + "num_tokens": 1429756659.0, + "step": 2983 + }, + { + "epoch": 1.770919881305638, + "grad_norm": 0.5216343402862549, + 
"learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7745898365974426, + "num_tokens": 1430245242.0, + "step": 2984 + }, + { + "epoch": 1.771513353115727, + "grad_norm": 0.5726694464683533, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.758670449256897, + "num_tokens": 1430750023.0, + "step": 2985 + }, + { + "epoch": 1.772106824925816, + "grad_norm": 0.5193004608154297, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.770290732383728, + "num_tokens": 1431304724.0, + "step": 2986 + }, + { + "epoch": 1.7727002967359051, + "grad_norm": 0.5698366165161133, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7614518404006958, + "num_tokens": 1431779901.0, + "step": 2987 + }, + { + "epoch": 1.7732937685459942, + "grad_norm": 0.5230977535247803, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7723194360733032, + "num_tokens": 1432262152.0, + "step": 2988 + }, + { + "epoch": 1.7738872403560833, + "grad_norm": 0.548768937587738, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.763861894607544, + "num_tokens": 1432799331.0, + "step": 2989 + }, + { + "epoch": 1.774480712166172, + "grad_norm": 0.635124146938324, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7568318843841553, + "num_tokens": 1433255901.0, + "step": 2990 + }, + { + "epoch": 1.7750741839762612, + "grad_norm": 0.5375957489013672, + "learning_rate": 1e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.775097131729126, + "num_tokens": 1433742153.0, + "step": 2991 + }, + { + "epoch": 1.77566765578635, + "grad_norm": 0.5477784872055054, + "learning_rate": 1e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.774142861366272, + "num_tokens": 1434216317.0, + "step": 2992 + }, + { + "epoch": 1.776261127596439, + "grad_norm": 0.5398727059364319, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7612334489822388, + "num_tokens": 1434728848.0, + "step": 2993 + 
}, + { + "epoch": 1.7768545994065281, + "grad_norm": 0.5589633584022522, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7638574242591858, + "num_tokens": 1435184267.0, + "step": 2994 + }, + { + "epoch": 1.7774480712166172, + "grad_norm": 0.5500018000602722, + "learning_rate": 1e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7728806734085083, + "num_tokens": 1435674319.0, + "step": 2995 + }, + { + "epoch": 1.7780415430267063, + "grad_norm": 0.5634213089942932, + "learning_rate": 1e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7782250642776489, + "num_tokens": 1436139180.0, + "step": 2996 + }, + { + "epoch": 1.7786350148367953, + "grad_norm": 0.5504935383796692, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7731571197509766, + "num_tokens": 1436616058.0, + "step": 2997 + }, + { + "epoch": 1.7792284866468844, + "grad_norm": 0.5590364933013916, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7693374156951904, + "num_tokens": 1437132926.0, + "step": 2998 + }, + { + "epoch": 1.7798219584569734, + "grad_norm": 0.5757502913475037, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7628951072692871, + "num_tokens": 1437619967.0, + "step": 2999 + }, + { + "epoch": 1.7804154302670623, + "grad_norm": 0.5085957646369934, + "learning_rate": 1e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7856399416923523, + "num_tokens": 1438151368.0, + "step": 3000 + }, + { + "epoch": 1.7810089020771513, + "grad_norm": 0.5304433107376099, + "learning_rate": 1e-06, + "loss": 0.711, + "mean_token_accuracy": 0.771857500076294, + "num_tokens": 1438612096.0, + "step": 3001 + }, + { + "epoch": 1.7816023738872404, + "grad_norm": 0.5174245238304138, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7674057483673096, + "num_tokens": 1439142841.0, + "step": 3002 + }, + { + "epoch": 1.7821958456973293, + "grad_norm": 0.5372490882873535, + "learning_rate": 1e-06, + "loss": 0.7614, + 
"mean_token_accuracy": 0.7590034008026123, + "num_tokens": 1439665611.0, + "step": 3003 + }, + { + "epoch": 1.7827893175074183, + "grad_norm": 0.5360315442085266, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7739050388336182, + "num_tokens": 1440129837.0, + "step": 3004 + }, + { + "epoch": 1.7833827893175074, + "grad_norm": 0.5500688552856445, + "learning_rate": 1e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7794189453125, + "num_tokens": 1440598237.0, + "step": 3005 + }, + { + "epoch": 1.7839762611275964, + "grad_norm": 0.5669185519218445, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7715235352516174, + "num_tokens": 1441067105.0, + "step": 3006 + }, + { + "epoch": 1.7845697329376855, + "grad_norm": 0.5274993777275085, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7541733980178833, + "num_tokens": 1441595784.0, + "step": 3007 + }, + { + "epoch": 1.7851632047477746, + "grad_norm": 0.5733584761619568, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7628318667411804, + "num_tokens": 1442038029.0, + "step": 3008 + }, + { + "epoch": 1.7857566765578636, + "grad_norm": 0.5961384773254395, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.765587568283081, + "num_tokens": 1442513429.0, + "step": 3009 + }, + { + "epoch": 1.7863501483679525, + "grad_norm": 0.5448429584503174, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7570022344589233, + "num_tokens": 1442953598.0, + "step": 3010 + }, + { + "epoch": 1.7869436201780415, + "grad_norm": 0.5629423260688782, + "learning_rate": 1e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7523527145385742, + "num_tokens": 1443427500.0, + "step": 3011 + }, + { + "epoch": 1.7875370919881306, + "grad_norm": 0.5302088260650635, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7784931063652039, + "num_tokens": 1443943081.0, + "step": 3012 + }, + { + "epoch": 1.7881305637982194, 
+ "grad_norm": 0.5482978224754333, + "learning_rate": 1e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7754074335098267, + "num_tokens": 1444416686.0, + "step": 3013 + }, + { + "epoch": 1.7887240356083085, + "grad_norm": 0.5736011862754822, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7725875973701477, + "num_tokens": 1444877197.0, + "step": 3014 + }, + { + "epoch": 1.7893175074183976, + "grad_norm": 0.5679749846458435, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7758564949035645, + "num_tokens": 1445346392.0, + "step": 3015 + }, + { + "epoch": 1.7899109792284866, + "grad_norm": 0.5588710904121399, + "learning_rate": 1e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7678399085998535, + "num_tokens": 1445826653.0, + "step": 3016 + }, + { + "epoch": 1.7905044510385757, + "grad_norm": 0.5346249341964722, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.764674186706543, + "num_tokens": 1446369011.0, + "step": 3017 + }, + { + "epoch": 1.7910979228486648, + "grad_norm": 0.534619927406311, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7668923139572144, + "num_tokens": 1446846393.0, + "step": 3018 + }, + { + "epoch": 1.7916913946587538, + "grad_norm": 0.5437115430831909, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7657647132873535, + "num_tokens": 1447314141.0, + "step": 3019 + }, + { + "epoch": 1.7922848664688429, + "grad_norm": 0.5144177675247192, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7610079050064087, + "num_tokens": 1447805610.0, + "step": 3020 + }, + { + "epoch": 1.7928783382789317, + "grad_norm": 0.5193988084793091, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.778990626335144, + "num_tokens": 1448350277.0, + "step": 3021 + }, + { + "epoch": 1.7934718100890208, + "grad_norm": 0.5887545347213745, + "learning_rate": 1e-06, + "loss": 0.7267, + "mean_token_accuracy": 
0.7681873440742493, + "num_tokens": 1448840941.0, + "step": 3022 + }, + { + "epoch": 1.7940652818991096, + "grad_norm": 0.567484438419342, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7720481157302856, + "num_tokens": 1449287279.0, + "step": 3023 + }, + { + "epoch": 1.7946587537091987, + "grad_norm": 0.5045203566551208, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.765576183795929, + "num_tokens": 1449819350.0, + "step": 3024 + }, + { + "epoch": 1.7952522255192878, + "grad_norm": 0.5388022661209106, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7597538232803345, + "num_tokens": 1450312208.0, + "step": 3025 + }, + { + "epoch": 1.7958456973293768, + "grad_norm": 0.5625734925270081, + "learning_rate": 1e-06, + "loss": 0.7061, + "mean_token_accuracy": 0.7754545211791992, + "num_tokens": 1450790468.0, + "step": 3026 + }, + { + "epoch": 1.7964391691394659, + "grad_norm": 0.5447664856910706, + "learning_rate": 1e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7635756134986877, + "num_tokens": 1451313259.0, + "step": 3027 + }, + { + "epoch": 1.797032640949555, + "grad_norm": 0.5536615252494812, + "learning_rate": 1e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7581906318664551, + "num_tokens": 1451779770.0, + "step": 3028 + }, + { + "epoch": 1.797626112759644, + "grad_norm": 0.6212976574897766, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7581838965415955, + "num_tokens": 1452171644.0, + "step": 3029 + }, + { + "epoch": 1.798219584569733, + "grad_norm": 0.5564117431640625, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7616633176803589, + "num_tokens": 1452667771.0, + "step": 3030 + }, + { + "epoch": 1.798813056379822, + "grad_norm": 0.5141096115112305, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7595272064208984, + "num_tokens": 1453204486.0, + "step": 3031 + }, + { + "epoch": 1.799406528189911, + "grad_norm": 
0.526129424571991, + "learning_rate": 1e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7694653272628784, + "num_tokens": 1453670354.0, + "step": 3032 + }, + { + "epoch": 1.8, + "grad_norm": 0.5670618414878845, + "learning_rate": 1e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.7744349241256714, + "num_tokens": 1454096370.0, + "step": 3033 + }, + { + "epoch": 1.8005934718100889, + "grad_norm": 0.5569489598274231, + "learning_rate": 1e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7845461368560791, + "num_tokens": 1454554965.0, + "step": 3034 + }, + { + "epoch": 1.801186943620178, + "grad_norm": 0.5683603882789612, + "learning_rate": 1e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7528807520866394, + "num_tokens": 1455030929.0, + "step": 3035 + }, + { + "epoch": 1.801780415430267, + "grad_norm": 0.5457832217216492, + "learning_rate": 1e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.7793761491775513, + "num_tokens": 1455513783.0, + "step": 3036 + }, + { + "epoch": 1.802373887240356, + "grad_norm": 0.589185893535614, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.760209858417511, + "num_tokens": 1455973198.0, + "step": 3037 + }, + { + "epoch": 1.8029673590504451, + "grad_norm": 0.5406283736228943, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7677319049835205, + "num_tokens": 1456511657.0, + "step": 3038 + }, + { + "epoch": 1.8035608308605342, + "grad_norm": 0.5764794945716858, + "learning_rate": 1e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7626974582672119, + "num_tokens": 1456947419.0, + "step": 3039 + }, + { + "epoch": 1.8041543026706233, + "grad_norm": 0.5744596123695374, + "learning_rate": 1e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.7869329452514648, + "num_tokens": 1457415112.0, + "step": 3040 + }, + { + "epoch": 1.8047477744807123, + "grad_norm": 0.5400311946868896, + "learning_rate": 1e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7647439241409302, + "num_tokens": 1457934415.0, + 
"step": 3041 + }, + { + "epoch": 1.8053412462908012, + "grad_norm": 0.5325559377670288, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7638974189758301, + "num_tokens": 1458412244.0, + "step": 3042 + }, + { + "epoch": 1.8059347181008902, + "grad_norm": 0.5239063501358032, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7582001686096191, + "num_tokens": 1458933812.0, + "step": 3043 + }, + { + "epoch": 1.806528189910979, + "grad_norm": 0.5309938192367554, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7750319242477417, + "num_tokens": 1459444949.0, + "step": 3044 + }, + { + "epoch": 1.8071216617210681, + "grad_norm": 0.5427067875862122, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7707103490829468, + "num_tokens": 1459913672.0, + "step": 3045 + }, + { + "epoch": 1.8077151335311572, + "grad_norm": 0.5558195114135742, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7747159600257874, + "num_tokens": 1460388241.0, + "step": 3046 + }, + { + "epoch": 1.8083086053412463, + "grad_norm": 0.5577951073646545, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7691789865493774, + "num_tokens": 1460896523.0, + "step": 3047 + }, + { + "epoch": 1.8089020771513353, + "grad_norm": 0.5439597964286804, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7648710012435913, + "num_tokens": 1461346694.0, + "step": 3048 + }, + { + "epoch": 1.8094955489614244, + "grad_norm": 0.5279859304428101, + "learning_rate": 1e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7754678726196289, + "num_tokens": 1461834740.0, + "step": 3049 + }, + { + "epoch": 1.8100890207715135, + "grad_norm": 0.5550218820571899, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7647634744644165, + "num_tokens": 1462300399.0, + "step": 3050 + }, + { + "epoch": 1.8106824925816025, + "grad_norm": 0.5690341591835022, + "learning_rate": 1e-06, + 
"loss": 0.7459, + "mean_token_accuracy": 0.7633283734321594, + "num_tokens": 1462728988.0, + "step": 3051 + }, + { + "epoch": 1.8112759643916914, + "grad_norm": 0.5280082821846008, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7707729339599609, + "num_tokens": 1463229832.0, + "step": 3052 + }, + { + "epoch": 1.8118694362017804, + "grad_norm": 0.5385752320289612, + "learning_rate": 1e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.7571876049041748, + "num_tokens": 1463703896.0, + "step": 3053 + }, + { + "epoch": 1.8124629080118695, + "grad_norm": 0.5502560138702393, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7651129961013794, + "num_tokens": 1464165740.0, + "step": 3054 + }, + { + "epoch": 1.8130563798219583, + "grad_norm": 0.5227601528167725, + "learning_rate": 1e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7750523090362549, + "num_tokens": 1464648001.0, + "step": 3055 + }, + { + "epoch": 1.8136498516320474, + "grad_norm": 0.5427488684654236, + "learning_rate": 1e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7560920119285583, + "num_tokens": 1465115569.0, + "step": 3056 + }, + { + "epoch": 1.8142433234421365, + "grad_norm": 0.5216299295425415, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7709183692932129, + "num_tokens": 1465597908.0, + "step": 3057 + }, + { + "epoch": 1.8148367952522255, + "grad_norm": 0.5406325459480286, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7697786688804626, + "num_tokens": 1466073435.0, + "step": 3058 + }, + { + "epoch": 1.8154302670623146, + "grad_norm": 0.5634597539901733, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7642183303833008, + "num_tokens": 1466561948.0, + "step": 3059 + }, + { + "epoch": 1.8160237388724036, + "grad_norm": 0.5262029767036438, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.761491596698761, + "num_tokens": 1467045702.0, + "step": 3060 + }, + { + 
"epoch": 1.8166172106824927, + "grad_norm": 0.5399943590164185, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7695556282997131, + "num_tokens": 1467516068.0, + "step": 3061 + }, + { + "epoch": 1.8172106824925816, + "grad_norm": 0.5448071360588074, + "learning_rate": 1e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7811168432235718, + "num_tokens": 1467984724.0, + "step": 3062 + }, + { + "epoch": 1.8178041543026706, + "grad_norm": 0.5549396872520447, + "learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7752673029899597, + "num_tokens": 1468487205.0, + "step": 3063 + }, + { + "epoch": 1.8183976261127597, + "grad_norm": 0.5244814157485962, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7735004425048828, + "num_tokens": 1468968509.0, + "step": 3064 + }, + { + "epoch": 1.8189910979228485, + "grad_norm": 0.5352152585983276, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7679793834686279, + "num_tokens": 1469437489.0, + "step": 3065 + }, + { + "epoch": 1.8195845697329376, + "grad_norm": 0.5741782784461975, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7687498331069946, + "num_tokens": 1469941470.0, + "step": 3066 + }, + { + "epoch": 1.8201780415430266, + "grad_norm": 0.5554210543632507, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7712637186050415, + "num_tokens": 1470441855.0, + "step": 3067 + }, + { + "epoch": 1.8207715133531157, + "grad_norm": 0.5519924163818359, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7632941603660583, + "num_tokens": 1470925948.0, + "step": 3068 + }, + { + "epoch": 1.8213649851632048, + "grad_norm": 0.5908061265945435, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7556831240653992, + "num_tokens": 1471372593.0, + "step": 3069 + }, + { + "epoch": 1.8219584569732938, + "grad_norm": 0.5731983780860901, + "learning_rate": 1e-06, + "loss": 0.715, + 
"mean_token_accuracy": 0.7731345295906067, + "num_tokens": 1471856120.0, + "step": 3070 + }, + { + "epoch": 1.822551928783383, + "grad_norm": 0.5757054686546326, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7634543776512146, + "num_tokens": 1472309940.0, + "step": 3071 + }, + { + "epoch": 1.823145400593472, + "grad_norm": 0.5631248950958252, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7601854205131531, + "num_tokens": 1472793895.0, + "step": 3072 + }, + { + "epoch": 1.8237388724035608, + "grad_norm": 0.5821793675422668, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.766066312789917, + "num_tokens": 1473292217.0, + "step": 3073 + }, + { + "epoch": 1.8243323442136499, + "grad_norm": 0.6267396211624146, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7664471864700317, + "num_tokens": 1473716962.0, + "step": 3074 + }, + { + "epoch": 1.8249258160237387, + "grad_norm": 0.5788608193397522, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7532833814620972, + "num_tokens": 1474152186.0, + "step": 3075 + }, + { + "epoch": 1.8255192878338278, + "grad_norm": 0.5250398516654968, + "learning_rate": 1e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7661924362182617, + "num_tokens": 1474676439.0, + "step": 3076 + }, + { + "epoch": 1.8261127596439168, + "grad_norm": 0.554288387298584, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7693637013435364, + "num_tokens": 1475102468.0, + "step": 3077 + }, + { + "epoch": 1.826706231454006, + "grad_norm": 0.5970165729522705, + "learning_rate": 1e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7654531598091125, + "num_tokens": 1475554125.0, + "step": 3078 + }, + { + "epoch": 1.827299703264095, + "grad_norm": 0.5403764843940735, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7669905424118042, + "num_tokens": 1475997104.0, + "step": 3079 + }, + { + "epoch": 
1.827893175074184, + "grad_norm": 0.5305481553077698, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7599564790725708, + "num_tokens": 1476500062.0, + "step": 3080 + }, + { + "epoch": 1.828486646884273, + "grad_norm": 0.5840716361999512, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7658836841583252, + "num_tokens": 1476909596.0, + "step": 3081 + }, + { + "epoch": 1.8290801186943622, + "grad_norm": 0.5370060801506042, + "learning_rate": 1e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.769818127155304, + "num_tokens": 1477389333.0, + "step": 3082 + }, + { + "epoch": 1.829673590504451, + "grad_norm": 0.5561617612838745, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7490493655204773, + "num_tokens": 1477840242.0, + "step": 3083 + }, + { + "epoch": 1.83026706231454, + "grad_norm": 0.5257545113563538, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7649437189102173, + "num_tokens": 1478364485.0, + "step": 3084 + }, + { + "epoch": 1.8308605341246291, + "grad_norm": 0.5331987738609314, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7628118991851807, + "num_tokens": 1478828930.0, + "step": 3085 + }, + { + "epoch": 1.831454005934718, + "grad_norm": 0.5397883653640747, + "learning_rate": 1e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7676724195480347, + "num_tokens": 1479288468.0, + "step": 3086 + }, + { + "epoch": 1.832047477744807, + "grad_norm": 0.5201759338378906, + "learning_rate": 1e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7709657549858093, + "num_tokens": 1479761394.0, + "step": 3087 + }, + { + "epoch": 1.832640949554896, + "grad_norm": 0.519324004650116, + "learning_rate": 1e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7672014832496643, + "num_tokens": 1480235761.0, + "step": 3088 + }, + { + "epoch": 1.8332344213649852, + "grad_norm": 0.5295624136924744, + "learning_rate": 1e-06, + "loss": 0.7394, + "mean_token_accuracy": 
0.7692554593086243, + "num_tokens": 1480711831.0, + "step": 3089 + }, + { + "epoch": 1.8338278931750742, + "grad_norm": 0.5397415161132812, + "learning_rate": 1e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7747887372970581, + "num_tokens": 1481211039.0, + "step": 3090 + }, + { + "epoch": 1.8344213649851633, + "grad_norm": 0.5303191542625427, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.763890266418457, + "num_tokens": 1481711126.0, + "step": 3091 + }, + { + "epoch": 1.8350148367952523, + "grad_norm": 0.49019768834114075, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7730128765106201, + "num_tokens": 1482256949.0, + "step": 3092 + }, + { + "epoch": 1.8356083086053414, + "grad_norm": 0.5285896062850952, + "learning_rate": 1e-06, + "loss": 0.6782, + "mean_token_accuracy": 0.7828572988510132, + "num_tokens": 1482716901.0, + "step": 3093 + }, + { + "epoch": 1.8362017804154303, + "grad_norm": 0.5529298186302185, + "learning_rate": 1e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7648595571517944, + "num_tokens": 1483166712.0, + "step": 3094 + }, + { + "epoch": 1.8367952522255193, + "grad_norm": 0.5031054615974426, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7689017057418823, + "num_tokens": 1483671242.0, + "step": 3095 + }, + { + "epoch": 1.8373887240356082, + "grad_norm": 0.5312634110450745, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7519450187683105, + "num_tokens": 1484140288.0, + "step": 3096 + }, + { + "epoch": 1.8379821958456972, + "grad_norm": 0.5866870880126953, + "learning_rate": 1e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.766664445400238, + "num_tokens": 1484568885.0, + "step": 3097 + }, + { + "epoch": 1.8385756676557863, + "grad_norm": 0.560161828994751, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7655940055847168, + "num_tokens": 1485023290.0, + "step": 3098 + }, + { + "epoch": 1.8391691394658753, + "grad_norm": 
0.5666797757148743, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7746692895889282, + "num_tokens": 1485481855.0, + "step": 3099 + }, + { + "epoch": 1.8397626112759644, + "grad_norm": 0.5594010949134827, + "learning_rate": 1e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7731499671936035, + "num_tokens": 1485889311.0, + "step": 3100 + }, + { + "epoch": 1.8403560830860535, + "grad_norm": 0.5340688228607178, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7634121179580688, + "num_tokens": 1486369375.0, + "step": 3101 + }, + { + "epoch": 1.8409495548961425, + "grad_norm": 0.5530672669410706, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.767015814781189, + "num_tokens": 1486825386.0, + "step": 3102 + }, + { + "epoch": 1.8415430267062316, + "grad_norm": 0.5516998171806335, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7726854085922241, + "num_tokens": 1487275626.0, + "step": 3103 + }, + { + "epoch": 1.8421364985163204, + "grad_norm": 0.5484156012535095, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7639484405517578, + "num_tokens": 1487774327.0, + "step": 3104 + }, + { + "epoch": 1.8427299703264095, + "grad_norm": 0.5311696529388428, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7662434577941895, + "num_tokens": 1488263710.0, + "step": 3105 + }, + { + "epoch": 1.8433234421364986, + "grad_norm": 0.5420921444892883, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7544998526573181, + "num_tokens": 1488725206.0, + "step": 3106 + }, + { + "epoch": 1.8439169139465874, + "grad_norm": 0.5560300946235657, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7479031085968018, + "num_tokens": 1489186122.0, + "step": 3107 + }, + { + "epoch": 1.8445103857566765, + "grad_norm": 0.5336573719978333, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7592134475708008, + 
"num_tokens": 1489676505.0, + "step": 3108 + }, + { + "epoch": 1.8451038575667655, + "grad_norm": 0.5201273560523987, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7650526762008667, + "num_tokens": 1490188820.0, + "step": 3109 + }, + { + "epoch": 1.8456973293768546, + "grad_norm": 0.5269899368286133, + "learning_rate": 1e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.750019371509552, + "num_tokens": 1490703532.0, + "step": 3110 + }, + { + "epoch": 1.8462908011869437, + "grad_norm": 0.5675714612007141, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7659790515899658, + "num_tokens": 1491167084.0, + "step": 3111 + }, + { + "epoch": 1.8468842729970327, + "grad_norm": 0.5252300500869751, + "learning_rate": 1e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7735026478767395, + "num_tokens": 1491656429.0, + "step": 3112 + }, + { + "epoch": 1.8474777448071218, + "grad_norm": 0.5776410698890686, + "learning_rate": 1e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7609881162643433, + "num_tokens": 1492096738.0, + "step": 3113 + }, + { + "epoch": 1.8480712166172106, + "grad_norm": 0.5153350830078125, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7676233053207397, + "num_tokens": 1492566243.0, + "step": 3114 + }, + { + "epoch": 1.8486646884272997, + "grad_norm": 0.532269299030304, + "learning_rate": 1e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7766973972320557, + "num_tokens": 1493055268.0, + "step": 3115 + }, + { + "epoch": 1.8492581602373888, + "grad_norm": 0.5053638219833374, + "learning_rate": 1e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7784945964813232, + "num_tokens": 1493590423.0, + "step": 3116 + }, + { + "epoch": 1.8498516320474776, + "grad_norm": 0.559744656085968, + "learning_rate": 1e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7512168884277344, + "num_tokens": 1494034545.0, + "step": 3117 + }, + { + "epoch": 1.8504451038575667, + "grad_norm": 0.5727823972702026, + 
"learning_rate": 1e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7795575261116028, + "num_tokens": 1494516424.0, + "step": 3118 + }, + { + "epoch": 1.8510385756676557, + "grad_norm": 0.5253838300704956, + "learning_rate": 1e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7787896394729614, + "num_tokens": 1495032048.0, + "step": 3119 + }, + { + "epoch": 1.8516320474777448, + "grad_norm": 0.506526529788971, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7702209949493408, + "num_tokens": 1495573362.0, + "step": 3120 + }, + { + "epoch": 1.8522255192878339, + "grad_norm": 0.5728347301483154, + "learning_rate": 1e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.756159245967865, + "num_tokens": 1496004647.0, + "step": 3121 + }, + { + "epoch": 1.852818991097923, + "grad_norm": 0.5500638484954834, + "learning_rate": 1e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7627174258232117, + "num_tokens": 1496468819.0, + "step": 3122 + }, + { + "epoch": 1.853412462908012, + "grad_norm": 0.5572510957717896, + "learning_rate": 1e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7543566823005676, + "num_tokens": 1496952527.0, + "step": 3123 + }, + { + "epoch": 1.854005934718101, + "grad_norm": 0.5775510668754578, + "learning_rate": 1e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.7847760319709778, + "num_tokens": 1497439983.0, + "step": 3124 + }, + { + "epoch": 1.8545994065281899, + "grad_norm": 0.5225672721862793, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.758529782295227, + "num_tokens": 1497984103.0, + "step": 3125 + }, + { + "epoch": 1.855192878338279, + "grad_norm": 0.5705183148384094, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7619791626930237, + "num_tokens": 1498453997.0, + "step": 3126 + }, + { + "epoch": 1.8557863501483678, + "grad_norm": 0.5169898867607117, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7596927881240845, + "num_tokens": 1498950093.0, + 
"step": 3127 + }, + { + "epoch": 1.8563798219584569, + "grad_norm": 0.5449495911598206, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.751753568649292, + "num_tokens": 1499407772.0, + "step": 3128 + }, + { + "epoch": 1.856973293768546, + "grad_norm": 0.5542178750038147, + "learning_rate": 1e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7645198106765747, + "num_tokens": 1499890094.0, + "step": 3129 + }, + { + "epoch": 1.857566765578635, + "grad_norm": 0.5352562069892883, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7531328201293945, + "num_tokens": 1500359051.0, + "step": 3130 + }, + { + "epoch": 1.858160237388724, + "grad_norm": 0.5380992889404297, + "learning_rate": 1e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7706705331802368, + "num_tokens": 1500813313.0, + "step": 3131 + }, + { + "epoch": 1.858753709198813, + "grad_norm": 0.499584436416626, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7764033079147339, + "num_tokens": 1501350789.0, + "step": 3132 + }, + { + "epoch": 1.8593471810089022, + "grad_norm": 0.5439760684967041, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7666388154029846, + "num_tokens": 1501852016.0, + "step": 3133 + }, + { + "epoch": 1.8599406528189912, + "grad_norm": 0.5831413865089417, + "learning_rate": 1e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7472866773605347, + "num_tokens": 1502290960.0, + "step": 3134 + }, + { + "epoch": 1.86053412462908, + "grad_norm": 0.5559849739074707, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7643630504608154, + "num_tokens": 1502745738.0, + "step": 3135 + }, + { + "epoch": 1.8611275964391691, + "grad_norm": 0.5498529076576233, + "learning_rate": 1e-06, + "loss": 0.6701, + "mean_token_accuracy": 0.7861420512199402, + "num_tokens": 1503214370.0, + "step": 3136 + }, + { + "epoch": 1.8617210682492582, + "grad_norm": 0.5342040061950684, + "learning_rate": 1e-06, + "loss": 
0.7097, + "mean_token_accuracy": 0.7743865847587585, + "num_tokens": 1503718794.0, + "step": 3137 + }, + { + "epoch": 1.862314540059347, + "grad_norm": 0.5576180815696716, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7638124227523804, + "num_tokens": 1504203953.0, + "step": 3138 + }, + { + "epoch": 1.862908011869436, + "grad_norm": 0.5559703707695007, + "learning_rate": 1e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.7684094905853271, + "num_tokens": 1504671434.0, + "step": 3139 + }, + { + "epoch": 1.8635014836795252, + "grad_norm": 0.5456222891807556, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.765581488609314, + "num_tokens": 1505180169.0, + "step": 3140 + }, + { + "epoch": 1.8640949554896142, + "grad_norm": 0.5602967143058777, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7722431421279907, + "num_tokens": 1505617057.0, + "step": 3141 + }, + { + "epoch": 1.8646884272997033, + "grad_norm": 0.5484112501144409, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.757994532585144, + "num_tokens": 1506114594.0, + "step": 3142 + }, + { + "epoch": 1.8652818991097924, + "grad_norm": 0.567348062992096, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7550517320632935, + "num_tokens": 1506569860.0, + "step": 3143 + }, + { + "epoch": 1.8658753709198814, + "grad_norm": 0.546978771686554, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.768976092338562, + "num_tokens": 1507045693.0, + "step": 3144 + }, + { + "epoch": 1.8664688427299705, + "grad_norm": 0.5537495017051697, + "learning_rate": 1e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7688548564910889, + "num_tokens": 1507507319.0, + "step": 3145 + }, + { + "epoch": 1.8670623145400593, + "grad_norm": 0.5598145723342896, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7570194602012634, + "num_tokens": 1508013745.0, + "step": 3146 + }, + { + "epoch": 
1.8676557863501484, + "grad_norm": 0.5665293335914612, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7569712400436401, + "num_tokens": 1508472834.0, + "step": 3147 + }, + { + "epoch": 1.8682492581602372, + "grad_norm": 0.529646098613739, + "learning_rate": 1e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7722189426422119, + "num_tokens": 1508970973.0, + "step": 3148 + }, + { + "epoch": 1.8688427299703263, + "grad_norm": 0.5254802107810974, + "learning_rate": 1e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.769894003868103, + "num_tokens": 1509452109.0, + "step": 3149 + }, + { + "epoch": 1.8694362017804154, + "grad_norm": 0.5615813136100769, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7698184251785278, + "num_tokens": 1509887153.0, + "step": 3150 + }, + { + "epoch": 1.8700296735905044, + "grad_norm": 0.5502499341964722, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7775638103485107, + "num_tokens": 1510359919.0, + "step": 3151 + }, + { + "epoch": 1.8706231454005935, + "grad_norm": 0.5509423017501831, + "learning_rate": 1e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7742828726768494, + "num_tokens": 1510808025.0, + "step": 3152 + }, + { + "epoch": 1.8712166172106826, + "grad_norm": 0.5757281184196472, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7505617141723633, + "num_tokens": 1511258573.0, + "step": 3153 + }, + { + "epoch": 1.8718100890207716, + "grad_norm": 0.5567874312400818, + "learning_rate": 1e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.7798486948013306, + "num_tokens": 1511740787.0, + "step": 3154 + }, + { + "epoch": 1.8724035608308607, + "grad_norm": 0.5535914897918701, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.741738498210907, + "num_tokens": 1512216796.0, + "step": 3155 + }, + { + "epoch": 1.8729970326409495, + "grad_norm": 0.5423860549926758, + "learning_rate": 1e-06, + "loss": 0.7901, + 
"mean_token_accuracy": 0.752685010433197, + "num_tokens": 1512673281.0, + "step": 3156 + }, + { + "epoch": 1.8735905044510386, + "grad_norm": 0.5484465956687927, + "learning_rate": 1e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7647291421890259, + "num_tokens": 1513160164.0, + "step": 3157 + }, + { + "epoch": 1.8741839762611276, + "grad_norm": 0.5687018632888794, + "learning_rate": 1e-06, + "loss": 0.78, + "mean_token_accuracy": 0.7567993998527527, + "num_tokens": 1513634884.0, + "step": 3158 + }, + { + "epoch": 1.8747774480712165, + "grad_norm": 0.5442213416099548, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.762660026550293, + "num_tokens": 1514112998.0, + "step": 3159 + }, + { + "epoch": 1.8753709198813056, + "grad_norm": 0.5469589233398438, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7662793397903442, + "num_tokens": 1514626976.0, + "step": 3160 + }, + { + "epoch": 1.8759643916913946, + "grad_norm": 0.5281570553779602, + "learning_rate": 1e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.7852228879928589, + "num_tokens": 1515129226.0, + "step": 3161 + }, + { + "epoch": 1.8765578635014837, + "grad_norm": 0.5186581015586853, + "learning_rate": 1e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7767184972763062, + "num_tokens": 1515666970.0, + "step": 3162 + }, + { + "epoch": 1.8771513353115727, + "grad_norm": 0.578204870223999, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.769963800907135, + "num_tokens": 1516128936.0, + "step": 3163 + }, + { + "epoch": 1.8777448071216618, + "grad_norm": 0.5658992528915405, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7734583616256714, + "num_tokens": 1516542752.0, + "step": 3164 + }, + { + "epoch": 1.8783382789317509, + "grad_norm": 0.5244554281234741, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7619573473930359, + "num_tokens": 1517029101.0, + "step": 3165 + }, + { + "epoch": 
1.8789317507418397, + "grad_norm": 0.5588635802268982, + "learning_rate": 1e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7756761312484741, + "num_tokens": 1517470608.0, + "step": 3166 + }, + { + "epoch": 1.8795252225519288, + "grad_norm": 0.5703470706939697, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7630095481872559, + "num_tokens": 1517916139.0, + "step": 3167 + }, + { + "epoch": 1.8801186943620178, + "grad_norm": 0.5126031041145325, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7570127248764038, + "num_tokens": 1518424928.0, + "step": 3168 + }, + { + "epoch": 1.8807121661721067, + "grad_norm": 0.5630404949188232, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7571734189987183, + "num_tokens": 1518872186.0, + "step": 3169 + }, + { + "epoch": 1.8813056379821957, + "grad_norm": 0.5630547404289246, + "learning_rate": 1e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7593516111373901, + "num_tokens": 1519346721.0, + "step": 3170 + }, + { + "epoch": 1.8818991097922848, + "grad_norm": 0.5490485429763794, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7570834159851074, + "num_tokens": 1519833738.0, + "step": 3171 + }, + { + "epoch": 1.8824925816023739, + "grad_norm": 0.5061565041542053, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.770508885383606, + "num_tokens": 1520370759.0, + "step": 3172 + }, + { + "epoch": 1.883086053412463, + "grad_norm": 0.5432368516921997, + "learning_rate": 1e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7538735270500183, + "num_tokens": 1520843964.0, + "step": 3173 + }, + { + "epoch": 1.883679525222552, + "grad_norm": 0.5392880439758301, + "learning_rate": 1e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7841333150863647, + "num_tokens": 1521310146.0, + "step": 3174 + }, + { + "epoch": 1.884272997032641, + "grad_norm": 0.5467761158943176, + "learning_rate": 1e-06, + "loss": 0.7122, + 
"mean_token_accuracy": 0.7732664942741394, + "num_tokens": 1521777509.0, + "step": 3175 + }, + { + "epoch": 1.8848664688427301, + "grad_norm": 0.552226722240448, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7631610631942749, + "num_tokens": 1522284622.0, + "step": 3176 + }, + { + "epoch": 1.885459940652819, + "grad_norm": 0.5171200037002563, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7737718820571899, + "num_tokens": 1522768440.0, + "step": 3177 + }, + { + "epoch": 1.886053412462908, + "grad_norm": 0.5710840225219727, + "learning_rate": 1e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7731428146362305, + "num_tokens": 1523210472.0, + "step": 3178 + }, + { + "epoch": 1.8866468842729969, + "grad_norm": 0.5183391571044922, + "learning_rate": 1e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7798312902450562, + "num_tokens": 1523686477.0, + "step": 3179 + }, + { + "epoch": 1.887240356083086, + "grad_norm": 0.48936423659324646, + "learning_rate": 1e-06, + "loss": 0.672, + "mean_token_accuracy": 0.7830229997634888, + "num_tokens": 1524228687.0, + "step": 3180 + }, + { + "epoch": 1.887833827893175, + "grad_norm": 0.530580997467041, + "learning_rate": 1e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7741739153862, + "num_tokens": 1524730494.0, + "step": 3181 + }, + { + "epoch": 1.888427299703264, + "grad_norm": 0.5618261694908142, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7671836018562317, + "num_tokens": 1525153203.0, + "step": 3182 + }, + { + "epoch": 1.8890207715133531, + "grad_norm": 0.5547890067100525, + "learning_rate": 1e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7716207504272461, + "num_tokens": 1525631992.0, + "step": 3183 + }, + { + "epoch": 1.8896142433234422, + "grad_norm": 0.5350112915039062, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.765916109085083, + "num_tokens": 1526129723.0, + "step": 3184 + }, + { + "epoch": 1.8902077151335313, + 
"grad_norm": 0.5357826352119446, + "learning_rate": 1e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7680631875991821, + "num_tokens": 1526611473.0, + "step": 3185 + }, + { + "epoch": 1.8908011869436203, + "grad_norm": 0.5415029525756836, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7606547474861145, + "num_tokens": 1527079337.0, + "step": 3186 + }, + { + "epoch": 1.8913946587537092, + "grad_norm": 0.5152459144592285, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7764670252799988, + "num_tokens": 1527562611.0, + "step": 3187 + }, + { + "epoch": 1.8919881305637982, + "grad_norm": 0.5489065647125244, + "learning_rate": 1e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.765390157699585, + "num_tokens": 1528041746.0, + "step": 3188 + }, + { + "epoch": 1.8925816023738873, + "grad_norm": 0.5786659717559814, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7554980516433716, + "num_tokens": 1528519946.0, + "step": 3189 + }, + { + "epoch": 1.8931750741839761, + "grad_norm": 0.5365166664123535, + "learning_rate": 1e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7724183201789856, + "num_tokens": 1529038420.0, + "step": 3190 + }, + { + "epoch": 1.8937685459940652, + "grad_norm": 0.5470101237297058, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7607335448265076, + "num_tokens": 1529521571.0, + "step": 3191 + }, + { + "epoch": 1.8943620178041543, + "grad_norm": 0.5480765700340271, + "learning_rate": 1e-06, + "loss": 0.6923, + "mean_token_accuracy": 0.778877317905426, + "num_tokens": 1530012942.0, + "step": 3192 + }, + { + "epoch": 1.8949554896142433, + "grad_norm": 0.5396052598953247, + "learning_rate": 1e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7621318101882935, + "num_tokens": 1530471397.0, + "step": 3193 + }, + { + "epoch": 1.8955489614243324, + "grad_norm": 0.5055142641067505, + "learning_rate": 1e-06, + "loss": 0.6975, + "mean_token_accuracy": 
0.7779645919799805, + "num_tokens": 1531037656.0, + "step": 3194 + }, + { + "epoch": 1.8961424332344214, + "grad_norm": 0.563191831111908, + "learning_rate": 1e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7720967531204224, + "num_tokens": 1531526544.0, + "step": 3195 + }, + { + "epoch": 1.8967359050445105, + "grad_norm": 0.5540051460266113, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7564287185668945, + "num_tokens": 1532000277.0, + "step": 3196 + }, + { + "epoch": 1.8973293768545996, + "grad_norm": 0.5005546808242798, + "learning_rate": 1e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.7842466831207275, + "num_tokens": 1532524346.0, + "step": 3197 + }, + { + "epoch": 1.8979228486646884, + "grad_norm": 0.5483164191246033, + "learning_rate": 1e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7583772540092468, + "num_tokens": 1533001752.0, + "step": 3198 + }, + { + "epoch": 1.8985163204747775, + "grad_norm": 0.6155997514724731, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7675189971923828, + "num_tokens": 1533448671.0, + "step": 3199 + }, + { + "epoch": 1.8991097922848663, + "grad_norm": 0.537531316280365, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7697018384933472, + "num_tokens": 1533917093.0, + "step": 3200 + }, + { + "epoch": 1.8997032640949554, + "grad_norm": 0.5298060178756714, + "learning_rate": 1e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7611345648765564, + "num_tokens": 1534438025.0, + "step": 3201 + }, + { + "epoch": 1.9002967359050444, + "grad_norm": 0.5474129915237427, + "learning_rate": 1e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.777739405632019, + "num_tokens": 1534919213.0, + "step": 3202 + }, + { + "epoch": 1.9008902077151335, + "grad_norm": 0.5710395574569702, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7673026323318481, + "num_tokens": 1535389870.0, + "step": 3203 + }, + { + "epoch": 1.9014836795252226, + "grad_norm": 
0.5489405393600464, + "learning_rate": 1e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7668052911758423, + "num_tokens": 1535831319.0, + "step": 3204 + }, + { + "epoch": 1.9020771513353116, + "grad_norm": 0.5590475797653198, + "learning_rate": 1e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7665036916732788, + "num_tokens": 1536294706.0, + "step": 3205 + }, + { + "epoch": 1.9026706231454007, + "grad_norm": 0.523039698600769, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7646973729133606, + "num_tokens": 1536772898.0, + "step": 3206 + }, + { + "epoch": 1.9032640949554898, + "grad_norm": 0.6045195460319519, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7491529583930969, + "num_tokens": 1537220483.0, + "step": 3207 + }, + { + "epoch": 1.9038575667655786, + "grad_norm": 0.5568854808807373, + "learning_rate": 1e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7691382169723511, + "num_tokens": 1537681705.0, + "step": 3208 + }, + { + "epoch": 1.9044510385756677, + "grad_norm": 0.5402199625968933, + "learning_rate": 1e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.7762134075164795, + "num_tokens": 1538160830.0, + "step": 3209 + }, + { + "epoch": 1.9050445103857567, + "grad_norm": 0.5091651678085327, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7677274346351624, + "num_tokens": 1538675035.0, + "step": 3210 + }, + { + "epoch": 1.9056379821958456, + "grad_norm": 0.5559536218643188, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7616211175918579, + "num_tokens": 1539148616.0, + "step": 3211 + }, + { + "epoch": 1.9062314540059346, + "grad_norm": 0.5535804033279419, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7666984796524048, + "num_tokens": 1539672254.0, + "step": 3212 + }, + { + "epoch": 1.9068249258160237, + "grad_norm": 0.5572757720947266, + "learning_rate": 1e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7732269763946533, + 
"num_tokens": 1540122567.0, + "step": 3213 + }, + { + "epoch": 1.9074183976261128, + "grad_norm": 0.5204413533210754, + "learning_rate": 1e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7822780609130859, + "num_tokens": 1540621242.0, + "step": 3214 + }, + { + "epoch": 1.9080118694362018, + "grad_norm": 0.5938486456871033, + "learning_rate": 1e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7755404710769653, + "num_tokens": 1541088744.0, + "step": 3215 + }, + { + "epoch": 1.9086053412462909, + "grad_norm": 0.57811039686203, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7725411057472229, + "num_tokens": 1541546858.0, + "step": 3216 + }, + { + "epoch": 1.90919881305638, + "grad_norm": 0.5907584428787231, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7653558254241943, + "num_tokens": 1541982725.0, + "step": 3217 + }, + { + "epoch": 1.9097922848664688, + "grad_norm": 0.5286505222320557, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7659933567047119, + "num_tokens": 1542513501.0, + "step": 3218 + }, + { + "epoch": 1.9103857566765579, + "grad_norm": 0.5599446296691895, + "learning_rate": 1e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.775856614112854, + "num_tokens": 1542957240.0, + "step": 3219 + }, + { + "epoch": 1.910979228486647, + "grad_norm": 0.5239474177360535, + "learning_rate": 1e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7612296342849731, + "num_tokens": 1543453574.0, + "step": 3220 + }, + { + "epoch": 1.9115727002967358, + "grad_norm": 0.5643782019615173, + "learning_rate": 1e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7724878787994385, + "num_tokens": 1543889431.0, + "step": 3221 + }, + { + "epoch": 1.9121661721068248, + "grad_norm": 0.5477997064590454, + "learning_rate": 1e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7787203788757324, + "num_tokens": 1544389012.0, + "step": 3222 + }, + { + "epoch": 1.9127596439169139, + "grad_norm": 0.5549737215042114, + 
"learning_rate": 1e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.7592447996139526, + "num_tokens": 1544825955.0, + "step": 3223 + }, + { + "epoch": 1.913353115727003, + "grad_norm": 0.5600978136062622, + "learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7675176858901978, + "num_tokens": 1545337371.0, + "step": 3224 + }, + { + "epoch": 1.913946587537092, + "grad_norm": 0.5711067914962769, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.765112578868866, + "num_tokens": 1545813021.0, + "step": 3225 + }, + { + "epoch": 1.914540059347181, + "grad_norm": 0.5572082996368408, + "learning_rate": 1e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7677904963493347, + "num_tokens": 1546269629.0, + "step": 3226 + }, + { + "epoch": 1.9151335311572701, + "grad_norm": 0.5446337461471558, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7578845024108887, + "num_tokens": 1546742657.0, + "step": 3227 + }, + { + "epoch": 1.9157270029673592, + "grad_norm": 0.5686119198799133, + "learning_rate": 1e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.775265634059906, + "num_tokens": 1547199197.0, + "step": 3228 + }, + { + "epoch": 1.916320474777448, + "grad_norm": 0.5416328310966492, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7599624395370483, + "num_tokens": 1547685590.0, + "step": 3229 + }, + { + "epoch": 1.916913946587537, + "grad_norm": 0.5302554368972778, + "learning_rate": 1e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7770501375198364, + "num_tokens": 1548176838.0, + "step": 3230 + }, + { + "epoch": 1.917507418397626, + "grad_norm": 0.5506988763809204, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7714593410491943, + "num_tokens": 1548677254.0, + "step": 3231 + }, + { + "epoch": 1.918100890207715, + "grad_norm": 0.5522232055664062, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7603826522827148, + "num_tokens": 1549131415.0, + "step": 
3232 + }, + { + "epoch": 1.918694362017804, + "grad_norm": 0.5438216328620911, + "learning_rate": 1e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.752957284450531, + "num_tokens": 1549603175.0, + "step": 3233 + }, + { + "epoch": 1.9192878338278931, + "grad_norm": 0.5972796082496643, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7633814811706543, + "num_tokens": 1550037977.0, + "step": 3234 + }, + { + "epoch": 1.9198813056379822, + "grad_norm": 0.5616275072097778, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7655137777328491, + "num_tokens": 1550508193.0, + "step": 3235 + }, + { + "epoch": 1.9204747774480713, + "grad_norm": 0.5128360986709595, + "learning_rate": 1e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7681300640106201, + "num_tokens": 1551017151.0, + "step": 3236 + }, + { + "epoch": 1.9210682492581603, + "grad_norm": 0.516160249710083, + "learning_rate": 1e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7680922746658325, + "num_tokens": 1551503405.0, + "step": 3237 + }, + { + "epoch": 1.9216617210682494, + "grad_norm": 0.5600342750549316, + "learning_rate": 1e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7699755430221558, + "num_tokens": 1551965729.0, + "step": 3238 + }, + { + "epoch": 1.9222551928783382, + "grad_norm": 0.5639526844024658, + "learning_rate": 1e-06, + "loss": 0.6981, + "mean_token_accuracy": 0.7754090428352356, + "num_tokens": 1552452752.0, + "step": 3239 + }, + { + "epoch": 1.9228486646884273, + "grad_norm": 0.5767248868942261, + "learning_rate": 1e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7503182888031006, + "num_tokens": 1552925991.0, + "step": 3240 + }, + { + "epoch": 1.9234421364985164, + "grad_norm": 0.5340412259101868, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7713227272033691, + "num_tokens": 1553449880.0, + "step": 3241 + }, + { + "epoch": 1.9240356083086052, + "grad_norm": 0.5457296371459961, + "learning_rate": 1e-06, + "loss": 0.6889, 
+ "mean_token_accuracy": 0.7805256843566895, + "num_tokens": 1553925427.0, + "step": 3242 + }, + { + "epoch": 1.9246290801186943, + "grad_norm": 0.5758295655250549, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7655301690101624, + "num_tokens": 1554360481.0, + "step": 3243 + }, + { + "epoch": 1.9252225519287833, + "grad_norm": 0.5828122496604919, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7606059908866882, + "num_tokens": 1554818462.0, + "step": 3244 + }, + { + "epoch": 1.9258160237388724, + "grad_norm": 0.5689891576766968, + "learning_rate": 1e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.7811437845230103, + "num_tokens": 1555321504.0, + "step": 3245 + }, + { + "epoch": 1.9264094955489615, + "grad_norm": 0.545617401599884, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7725921869277954, + "num_tokens": 1555781135.0, + "step": 3246 + }, + { + "epoch": 1.9270029673590505, + "grad_norm": 0.5514024496078491, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7683866024017334, + "num_tokens": 1556248973.0, + "step": 3247 + }, + { + "epoch": 1.9275964391691396, + "grad_norm": 0.5629300475120544, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7705830335617065, + "num_tokens": 1556718062.0, + "step": 3248 + }, + { + "epoch": 1.9281899109792286, + "grad_norm": 0.5499588847160339, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7553727626800537, + "num_tokens": 1557212943.0, + "step": 3249 + }, + { + "epoch": 1.9287833827893175, + "grad_norm": 0.5435228943824768, + "learning_rate": 1e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7758073806762695, + "num_tokens": 1557665798.0, + "step": 3250 + }, + { + "epoch": 1.9293768545994066, + "grad_norm": 0.5699474811553955, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7669976949691772, + "num_tokens": 1558143577.0, + "step": 3251 + }, + { + "epoch": 
1.9299703264094954, + "grad_norm": 0.5359399318695068, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7702011466026306, + "num_tokens": 1558620056.0, + "step": 3252 + }, + { + "epoch": 1.9305637982195845, + "grad_norm": 0.5273976922035217, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.765985906124115, + "num_tokens": 1559101752.0, + "step": 3253 + }, + { + "epoch": 1.9311572700296735, + "grad_norm": 0.6006179451942444, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7664971351623535, + "num_tokens": 1559546514.0, + "step": 3254 + }, + { + "epoch": 1.9317507418397626, + "grad_norm": 0.6014372110366821, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7592223882675171, + "num_tokens": 1560045176.0, + "step": 3255 + }, + { + "epoch": 1.9323442136498516, + "grad_norm": 0.5258470177650452, + "learning_rate": 1e-06, + "loss": 0.6842, + "mean_token_accuracy": 0.7799035906791687, + "num_tokens": 1560507727.0, + "step": 3256 + }, + { + "epoch": 1.9329376854599407, + "grad_norm": 0.5732410550117493, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.77527916431427, + "num_tokens": 1560970522.0, + "step": 3257 + }, + { + "epoch": 1.9335311572700298, + "grad_norm": 0.5802286267280579, + "learning_rate": 1e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.775076150894165, + "num_tokens": 1561424015.0, + "step": 3258 + }, + { + "epoch": 1.9341246290801188, + "grad_norm": 0.5475548505783081, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7667019963264465, + "num_tokens": 1561870771.0, + "step": 3259 + }, + { + "epoch": 1.9347181008902077, + "grad_norm": 0.536341667175293, + "learning_rate": 1e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7751632928848267, + "num_tokens": 1562312873.0, + "step": 3260 + }, + { + "epoch": 1.9353115727002967, + "grad_norm": 0.530232310295105, + "learning_rate": 1e-06, + "loss": 0.7222, + "mean_token_accuracy": 
0.772150993347168, + "num_tokens": 1562816287.0, + "step": 3261 + }, + { + "epoch": 1.9359050445103858, + "grad_norm": 0.5457180738449097, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7602604627609253, + "num_tokens": 1563335170.0, + "step": 3262 + }, + { + "epoch": 1.9364985163204746, + "grad_norm": 0.5790969729423523, + "learning_rate": 1e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7742383480072021, + "num_tokens": 1563824372.0, + "step": 3263 + }, + { + "epoch": 1.9370919881305637, + "grad_norm": 0.5686534643173218, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7638593316078186, + "num_tokens": 1564262730.0, + "step": 3264 + }, + { + "epoch": 1.9376854599406528, + "grad_norm": 0.5228031277656555, + "learning_rate": 1e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7616558074951172, + "num_tokens": 1564766292.0, + "step": 3265 + }, + { + "epoch": 1.9382789317507418, + "grad_norm": 0.5733746886253357, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7651359438896179, + "num_tokens": 1565226127.0, + "step": 3266 + }, + { + "epoch": 1.938872403560831, + "grad_norm": 0.6077466011047363, + "learning_rate": 1e-06, + "loss": 0.6776, + "mean_token_accuracy": 0.7818248271942139, + "num_tokens": 1565699027.0, + "step": 3267 + }, + { + "epoch": 1.93946587537092, + "grad_norm": 0.5544946193695068, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7468245029449463, + "num_tokens": 1566143052.0, + "step": 3268 + }, + { + "epoch": 1.940059347181009, + "grad_norm": 0.5729820728302002, + "learning_rate": 1e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7665952444076538, + "num_tokens": 1566644424.0, + "step": 3269 + }, + { + "epoch": 1.9406528189910979, + "grad_norm": 0.5437834858894348, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7683470249176025, + "num_tokens": 1567148226.0, + "step": 3270 + }, + { + "epoch": 1.941246290801187, + "grad_norm": 
0.5547215342521667, + "learning_rate": 1e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7648705840110779, + "num_tokens": 1567613446.0, + "step": 3271 + }, + { + "epoch": 1.941839762611276, + "grad_norm": 0.5402689576148987, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7680906653404236, + "num_tokens": 1568095940.0, + "step": 3272 + }, + { + "epoch": 1.9424332344213648, + "grad_norm": 0.5596492886543274, + "learning_rate": 1e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7524975538253784, + "num_tokens": 1568527273.0, + "step": 3273 + }, + { + "epoch": 1.943026706231454, + "grad_norm": 0.5616393089294434, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7708218693733215, + "num_tokens": 1568993508.0, + "step": 3274 + }, + { + "epoch": 1.943620178041543, + "grad_norm": 0.5135066509246826, + "learning_rate": 1e-06, + "loss": 0.6866, + "mean_token_accuracy": 0.7799537181854248, + "num_tokens": 1569498278.0, + "step": 3275 + }, + { + "epoch": 1.944213649851632, + "grad_norm": 0.5170163512229919, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7684149742126465, + "num_tokens": 1569992832.0, + "step": 3276 + }, + { + "epoch": 1.944807121661721, + "grad_norm": 0.5214649438858032, + "learning_rate": 1e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7761076092720032, + "num_tokens": 1570475026.0, + "step": 3277 + }, + { + "epoch": 1.9454005934718102, + "grad_norm": 0.5451305508613586, + "learning_rate": 1e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7798460721969604, + "num_tokens": 1570952503.0, + "step": 3278 + }, + { + "epoch": 1.9459940652818992, + "grad_norm": 0.546886682510376, + "learning_rate": 1e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7717945575714111, + "num_tokens": 1571422252.0, + "step": 3279 + }, + { + "epoch": 1.9465875370919883, + "grad_norm": 0.5405285358428955, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7698880434036255, + "num_tokens": 
1571892657.0, + "step": 3280 + }, + { + "epoch": 1.9471810089020771, + "grad_norm": 0.570145308971405, + "learning_rate": 1e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7665566205978394, + "num_tokens": 1572375133.0, + "step": 3281 + }, + { + "epoch": 1.9477744807121662, + "grad_norm": 0.5680491328239441, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7670156955718994, + "num_tokens": 1572834318.0, + "step": 3282 + }, + { + "epoch": 1.948367952522255, + "grad_norm": 0.5623341798782349, + "learning_rate": 1e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7748925685882568, + "num_tokens": 1573303178.0, + "step": 3283 + }, + { + "epoch": 1.948961424332344, + "grad_norm": 0.5541077256202698, + "learning_rate": 1e-06, + "loss": 0.774, + "mean_token_accuracy": 0.755001425743103, + "num_tokens": 1573784183.0, + "step": 3284 + }, + { + "epoch": 1.9495548961424332, + "grad_norm": 0.5431365370750427, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.768891453742981, + "num_tokens": 1574252648.0, + "step": 3285 + }, + { + "epoch": 1.9501483679525222, + "grad_norm": 0.5523958802223206, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7647762894630432, + "num_tokens": 1574705828.0, + "step": 3286 + }, + { + "epoch": 1.9507418397626113, + "grad_norm": 0.5338670611381531, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7640013694763184, + "num_tokens": 1575179107.0, + "step": 3287 + }, + { + "epoch": 1.9513353115727003, + "grad_norm": 0.5512681007385254, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7688701152801514, + "num_tokens": 1575633922.0, + "step": 3288 + }, + { + "epoch": 1.9519287833827894, + "grad_norm": 0.6016425490379333, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7745294570922852, + "num_tokens": 1576112627.0, + "step": 3289 + }, + { + "epoch": 1.9525222551928785, + "grad_norm": 0.5492302775382996, + "learning_rate": 
1e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7654860615730286, + "num_tokens": 1576593754.0, + "step": 3290 + }, + { + "epoch": 1.9531157270029673, + "grad_norm": 0.5599586963653564, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7595439553260803, + "num_tokens": 1577038250.0, + "step": 3291 + }, + { + "epoch": 1.9537091988130564, + "grad_norm": 0.5590896010398865, + "learning_rate": 1e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7667410969734192, + "num_tokens": 1577526229.0, + "step": 3292 + }, + { + "epoch": 1.9543026706231454, + "grad_norm": 0.5622943639755249, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7514129877090454, + "num_tokens": 1578003188.0, + "step": 3293 + }, + { + "epoch": 1.9548961424332343, + "grad_norm": 0.5374332666397095, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7673836946487427, + "num_tokens": 1578490166.0, + "step": 3294 + }, + { + "epoch": 1.9554896142433233, + "grad_norm": 0.5471683740615845, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7778626680374146, + "num_tokens": 1578965247.0, + "step": 3295 + }, + { + "epoch": 1.9560830860534124, + "grad_norm": 0.558612048625946, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7730998992919922, + "num_tokens": 1579449975.0, + "step": 3296 + }, + { + "epoch": 1.9566765578635015, + "grad_norm": 0.589552640914917, + "learning_rate": 1e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7556272745132446, + "num_tokens": 1579949308.0, + "step": 3297 + }, + { + "epoch": 1.9572700296735905, + "grad_norm": 0.523547351360321, + "learning_rate": 1e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7663595080375671, + "num_tokens": 1580422214.0, + "step": 3298 + }, + { + "epoch": 1.9578635014836796, + "grad_norm": 0.5156693458557129, + "learning_rate": 1e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7733081579208374, + "num_tokens": 1580932181.0, + "step": 3299 + }, + { + 
"epoch": 1.9584569732937687, + "grad_norm": 0.5571773052215576, + "learning_rate": 1e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7727607488632202, + "num_tokens": 1581390534.0, + "step": 3300 + }, + { + "epoch": 1.9590504451038577, + "grad_norm": 0.5884343981742859, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7509480118751526, + "num_tokens": 1581857923.0, + "step": 3301 + }, + { + "epoch": 1.9596439169139466, + "grad_norm": 0.5226216912269592, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7665490508079529, + "num_tokens": 1582387924.0, + "step": 3302 + }, + { + "epoch": 1.9602373887240356, + "grad_norm": 0.523737370967865, + "learning_rate": 1e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7762953639030457, + "num_tokens": 1582916016.0, + "step": 3303 + }, + { + "epoch": 1.9608308605341245, + "grad_norm": 0.5272903442382812, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7698409557342529, + "num_tokens": 1583438344.0, + "step": 3304 + }, + { + "epoch": 1.9614243323442135, + "grad_norm": 0.5290464162826538, + "learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7731137275695801, + "num_tokens": 1583935905.0, + "step": 3305 + }, + { + "epoch": 1.9620178041543026, + "grad_norm": 0.5742356181144714, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7582827806472778, + "num_tokens": 1584372322.0, + "step": 3306 + }, + { + "epoch": 1.9626112759643917, + "grad_norm": 0.5689401030540466, + "learning_rate": 1e-06, + "loss": 0.6699, + "mean_token_accuracy": 0.7852311134338379, + "num_tokens": 1584807597.0, + "step": 3307 + }, + { + "epoch": 1.9632047477744807, + "grad_norm": 0.5106021165847778, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7651584148406982, + "num_tokens": 1585349557.0, + "step": 3308 + }, + { + "epoch": 1.9637982195845698, + "grad_norm": 0.5056979656219482, + "learning_rate": 1e-06, + "loss": 0.7345, + 
"mean_token_accuracy": 0.7669347524642944, + "num_tokens": 1585861083.0, + "step": 3309 + }, + { + "epoch": 1.9643916913946589, + "grad_norm": 0.5699738264083862, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7701951265335083, + "num_tokens": 1586292133.0, + "step": 3310 + }, + { + "epoch": 1.964985163204748, + "grad_norm": 0.5316281318664551, + "learning_rate": 1e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7815817594528198, + "num_tokens": 1586808830.0, + "step": 3311 + }, + { + "epoch": 1.9655786350148368, + "grad_norm": 0.5500369668006897, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7621779441833496, + "num_tokens": 1587269150.0, + "step": 3312 + }, + { + "epoch": 1.9661721068249258, + "grad_norm": 0.5116422772407532, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7593234181404114, + "num_tokens": 1587775255.0, + "step": 3313 + }, + { + "epoch": 1.9667655786350149, + "grad_norm": 0.512076199054718, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7626552581787109, + "num_tokens": 1588287793.0, + "step": 3314 + }, + { + "epoch": 1.9673590504451037, + "grad_norm": 0.5427932739257812, + "learning_rate": 1e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7782821655273438, + "num_tokens": 1588723883.0, + "step": 3315 + }, + { + "epoch": 1.9679525222551928, + "grad_norm": 0.5598021149635315, + "learning_rate": 1e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7693625688552856, + "num_tokens": 1589155946.0, + "step": 3316 + }, + { + "epoch": 1.9685459940652819, + "grad_norm": 0.5421726703643799, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7733221054077148, + "num_tokens": 1589607551.0, + "step": 3317 + }, + { + "epoch": 1.969139465875371, + "grad_norm": 0.5545188188552856, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7571009993553162, + "num_tokens": 1590110073.0, + "step": 3318 + }, + { + "epoch": 
1.96973293768546, + "grad_norm": 0.5308838486671448, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7742747664451599, + "num_tokens": 1590599793.0, + "step": 3319 + }, + { + "epoch": 1.970326409495549, + "grad_norm": 0.5413599610328674, + "learning_rate": 1e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7812054753303528, + "num_tokens": 1591067196.0, + "step": 3320 + }, + { + "epoch": 1.970919881305638, + "grad_norm": 0.5290445685386658, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.769167959690094, + "num_tokens": 1591570430.0, + "step": 3321 + }, + { + "epoch": 1.971513353115727, + "grad_norm": 0.5127536654472351, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7737802863121033, + "num_tokens": 1592083420.0, + "step": 3322 + }, + { + "epoch": 1.972106824925816, + "grad_norm": 0.5307551622390747, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7707632780075073, + "num_tokens": 1592564273.0, + "step": 3323 + }, + { + "epoch": 1.972700296735905, + "grad_norm": 0.5142926573753357, + "learning_rate": 1e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7732976675033569, + "num_tokens": 1593027005.0, + "step": 3324 + }, + { + "epoch": 1.973293768545994, + "grad_norm": 0.5297878384590149, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.766937255859375, + "num_tokens": 1593505183.0, + "step": 3325 + }, + { + "epoch": 1.973887240356083, + "grad_norm": 0.5095561742782593, + "learning_rate": 1e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7760426998138428, + "num_tokens": 1594000264.0, + "step": 3326 + }, + { + "epoch": 1.974480712166172, + "grad_norm": 0.5412817001342773, + "learning_rate": 1e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7610995769500732, + "num_tokens": 1594473683.0, + "step": 3327 + }, + { + "epoch": 1.975074183976261, + "grad_norm": 0.519633948802948, + "learning_rate": 1e-06, + "loss": 0.7258, + "mean_token_accuracy": 
0.7698227763175964, + "num_tokens": 1594998300.0, + "step": 3328 + }, + { + "epoch": 1.9756676557863502, + "grad_norm": 0.499070942401886, + "learning_rate": 1e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7795078158378601, + "num_tokens": 1595507470.0, + "step": 3329 + }, + { + "epoch": 1.9762611275964392, + "grad_norm": 0.5699664354324341, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7607846260070801, + "num_tokens": 1595948663.0, + "step": 3330 + }, + { + "epoch": 1.9768545994065283, + "grad_norm": 0.5249819755554199, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7631638050079346, + "num_tokens": 1596425625.0, + "step": 3331 + }, + { + "epoch": 1.9774480712166174, + "grad_norm": 0.5359465479850769, + "learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7742022275924683, + "num_tokens": 1596897270.0, + "step": 3332 + }, + { + "epoch": 1.9780415430267062, + "grad_norm": 0.5196255445480347, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7622084617614746, + "num_tokens": 1597389615.0, + "step": 3333 + }, + { + "epoch": 1.9786350148367953, + "grad_norm": 0.5590579509735107, + "learning_rate": 1e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.762665867805481, + "num_tokens": 1597913206.0, + "step": 3334 + }, + { + "epoch": 1.979228486646884, + "grad_norm": 0.521112859249115, + "learning_rate": 1e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7824981212615967, + "num_tokens": 1598419998.0, + "step": 3335 + }, + { + "epoch": 1.9798219584569732, + "grad_norm": 0.5374521613121033, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7738168835639954, + "num_tokens": 1598924572.0, + "step": 3336 + }, + { + "epoch": 1.9804154302670622, + "grad_norm": 0.5449469685554504, + "learning_rate": 1e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.7837156057357788, + "num_tokens": 1599423410.0, + "step": 3337 + }, + { + "epoch": 1.9810089020771513, + "grad_norm": 
0.5419023036956787, + "learning_rate": 1e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7652604579925537, + "num_tokens": 1599908605.0, + "step": 3338 + }, + { + "epoch": 1.9816023738872404, + "grad_norm": 0.5554314255714417, + "learning_rate": 1e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7500758171081543, + "num_tokens": 1600353027.0, + "step": 3339 + }, + { + "epoch": 1.9821958456973294, + "grad_norm": 0.5400270819664001, + "learning_rate": 1e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7793881297111511, + "num_tokens": 1600846253.0, + "step": 3340 + }, + { + "epoch": 1.9827893175074185, + "grad_norm": 0.5674182176589966, + "learning_rate": 1e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7785502672195435, + "num_tokens": 1601322606.0, + "step": 3341 + }, + { + "epoch": 1.9833827893175076, + "grad_norm": 0.5044947266578674, + "learning_rate": 1e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7707194685935974, + "num_tokens": 1601844892.0, + "step": 3342 + }, + { + "epoch": 1.9839762611275964, + "grad_norm": 0.5500999689102173, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7543444037437439, + "num_tokens": 1602324205.0, + "step": 3343 + }, + { + "epoch": 1.9845697329376855, + "grad_norm": 0.5416202545166016, + "learning_rate": 1e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7755951881408691, + "num_tokens": 1602816629.0, + "step": 3344 + }, + { + "epoch": 1.9851632047477745, + "grad_norm": 0.568006157875061, + "learning_rate": 1e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7714720368385315, + "num_tokens": 1603246855.0, + "step": 3345 + }, + { + "epoch": 1.9857566765578634, + "grad_norm": 0.5526055693626404, + "learning_rate": 1e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7553669214248657, + "num_tokens": 1603711435.0, + "step": 3346 + }, + { + "epoch": 1.9863501483679524, + "grad_norm": 0.536434531211853, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.770865797996521, + 
"num_tokens": 1604233499.0, + "step": 3347 + }, + { + "epoch": 1.9869436201780415, + "grad_norm": 0.5361291170120239, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7735717296600342, + "num_tokens": 1604702594.0, + "step": 3348 + }, + { + "epoch": 1.9875370919881306, + "grad_norm": 0.525577962398529, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7656232118606567, + "num_tokens": 1605199311.0, + "step": 3349 + }, + { + "epoch": 1.9881305637982196, + "grad_norm": 0.5445780754089355, + "learning_rate": 1e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.765581488609314, + "num_tokens": 1605684061.0, + "step": 3350 + }, + { + "epoch": 1.9887240356083087, + "grad_norm": 0.5450751185417175, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7753992080688477, + "num_tokens": 1606162154.0, + "step": 3351 + }, + { + "epoch": 1.9893175074183977, + "grad_norm": 0.5589345097541809, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7754824161529541, + "num_tokens": 1606611946.0, + "step": 3352 + }, + { + "epoch": 1.9899109792284868, + "grad_norm": 0.5482579469680786, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7613829374313354, + "num_tokens": 1607089501.0, + "step": 3353 + }, + { + "epoch": 1.9905044510385757, + "grad_norm": 0.5569537281990051, + "learning_rate": 1e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7700414657592773, + "num_tokens": 1607526381.0, + "step": 3354 + }, + { + "epoch": 1.9910979228486647, + "grad_norm": 0.5440525412559509, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7515685558319092, + "num_tokens": 1608008523.0, + "step": 3355 + }, + { + "epoch": 1.9916913946587536, + "grad_norm": 0.5107528567314148, + "learning_rate": 1e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7760805487632751, + "num_tokens": 1608520880.0, + "step": 3356 + }, + { + "epoch": 1.9922848664688426, + "grad_norm": 0.534827709197998, + 
"learning_rate": 1e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7695654630661011, + "num_tokens": 1609013614.0, + "step": 3357 + }, + { + "epoch": 1.9928783382789317, + "grad_norm": 0.5762607455253601, + "learning_rate": 1e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7570350766181946, + "num_tokens": 1609452540.0, + "step": 3358 + }, + { + "epoch": 1.9934718100890207, + "grad_norm": 0.5284684300422668, + "learning_rate": 1e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7586642503738403, + "num_tokens": 1609909941.0, + "step": 3359 + }, + { + "epoch": 1.9940652818991098, + "grad_norm": 0.5086534023284912, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.77318274974823, + "num_tokens": 1610434781.0, + "step": 3360 + }, + { + "epoch": 1.9946587537091989, + "grad_norm": 0.567945659160614, + "learning_rate": 1e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.751548707485199, + "num_tokens": 1610910929.0, + "step": 3361 + }, + { + "epoch": 1.995252225519288, + "grad_norm": 0.5476954579353333, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7721401453018188, + "num_tokens": 1611378335.0, + "step": 3362 + }, + { + "epoch": 1.995845697329377, + "grad_norm": 0.5300246477127075, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7664183974266052, + "num_tokens": 1611879600.0, + "step": 3363 + }, + { + "epoch": 1.9964391691394658, + "grad_norm": 0.5418618321418762, + "learning_rate": 1e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7763720154762268, + "num_tokens": 1612363924.0, + "step": 3364 + }, + { + "epoch": 1.997032640949555, + "grad_norm": 0.5608958005905151, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7660882472991943, + "num_tokens": 1612840591.0, + "step": 3365 + }, + { + "epoch": 1.997626112759644, + "grad_norm": 0.5304967761039734, + "learning_rate": 1e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7600520849227905, + "num_tokens": 1613350147.0, + "step": 
3366 + }, + { + "epoch": 1.9982195845697328, + "grad_norm": 0.5341174006462097, + "learning_rate": 1e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.785240888595581, + "num_tokens": 1613824809.0, + "step": 3367 + }, + { + "epoch": 1.9988130563798219, + "grad_norm": 0.5443593859672546, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7637669444084167, + "num_tokens": 1614297723.0, + "step": 3368 + }, + { + "epoch": 1.999406528189911, + "grad_norm": 0.5185397863388062, + "learning_rate": 1e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7622286677360535, + "num_tokens": 1614807697.0, + "step": 3369 + }, + { + "epoch": 2.0, + "grad_norm": 0.5461151003837585, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7555603981018066, + "num_tokens": 1615284141.0, + "step": 3370 + }, + { + "epoch": 2.000593471810089, + "grad_norm": 0.5618637800216675, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7659225463867188, + "num_tokens": 1615757687.0, + "step": 3371 + }, + { + "epoch": 2.001186943620178, + "grad_norm": 0.5686736702919006, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7622595429420471, + "num_tokens": 1616212712.0, + "step": 3372 + }, + { + "epoch": 2.001780415430267, + "grad_norm": 0.5581168532371521, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7622480392456055, + "num_tokens": 1616727787.0, + "step": 3373 + }, + { + "epoch": 2.0023738872403563, + "grad_norm": 0.5084033012390137, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7679520845413208, + "num_tokens": 1617268727.0, + "step": 3374 + }, + { + "epoch": 2.0029673590504453, + "grad_norm": 0.5705530643463135, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7730144262313843, + "num_tokens": 1617738425.0, + "step": 3375 + }, + { + "epoch": 2.003560830860534, + "grad_norm": 0.551472008228302, + "learning_rate": 1e-06, + "loss": 0.7221, + 
"mean_token_accuracy": 0.7705772519111633, + "num_tokens": 1618199687.0, + "step": 3376 + }, + { + "epoch": 2.004154302670623, + "grad_norm": 0.5470395684242249, + "learning_rate": 1e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7813692092895508, + "num_tokens": 1618656338.0, + "step": 3377 + }, + { + "epoch": 2.004747774480712, + "grad_norm": 0.5751737356185913, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7686240077018738, + "num_tokens": 1619117185.0, + "step": 3378 + }, + { + "epoch": 2.005341246290801, + "grad_norm": 0.5801493525505066, + "learning_rate": 1e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7583914399147034, + "num_tokens": 1619556190.0, + "step": 3379 + }, + { + "epoch": 2.00593471810089, + "grad_norm": 0.5717679262161255, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7804804444313049, + "num_tokens": 1620042897.0, + "step": 3380 + }, + { + "epoch": 2.0065281899109793, + "grad_norm": 0.5698089599609375, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7688179612159729, + "num_tokens": 1620526915.0, + "step": 3381 + }, + { + "epoch": 2.0071216617210683, + "grad_norm": 0.5642390251159668, + "learning_rate": 1e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7655454874038696, + "num_tokens": 1621037044.0, + "step": 3382 + }, + { + "epoch": 2.0077151335311574, + "grad_norm": 0.5821595191955566, + "learning_rate": 1e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.7818707823753357, + "num_tokens": 1621487318.0, + "step": 3383 + }, + { + "epoch": 2.0083086053412464, + "grad_norm": 0.5657975673675537, + "learning_rate": 1e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.7850542068481445, + "num_tokens": 1621976063.0, + "step": 3384 + }, + { + "epoch": 2.0089020771513355, + "grad_norm": 0.548637330532074, + "learning_rate": 1e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.772190511226654, + "num_tokens": 1622475035.0, + "step": 3385 + }, + { + "epoch": 2.009495548961424, 
+ "grad_norm": 0.5559076070785522, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7644799947738647, + "num_tokens": 1622931070.0, + "step": 3386 + }, + { + "epoch": 2.010089020771513, + "grad_norm": 0.60866379737854, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7632522583007812, + "num_tokens": 1623384600.0, + "step": 3387 + }, + { + "epoch": 2.0106824925816023, + "grad_norm": 0.6153512001037598, + "learning_rate": 1e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7638607025146484, + "num_tokens": 1623832400.0, + "step": 3388 + }, + { + "epoch": 2.0112759643916913, + "grad_norm": 0.56094890832901, + "learning_rate": 1e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7753843665122986, + "num_tokens": 1624338429.0, + "step": 3389 + }, + { + "epoch": 2.0118694362017804, + "grad_norm": 0.573879599571228, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7626127004623413, + "num_tokens": 1624866921.0, + "step": 3390 + }, + { + "epoch": 2.0124629080118694, + "grad_norm": 0.6478488445281982, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7647037506103516, + "num_tokens": 1625331413.0, + "step": 3391 + }, + { + "epoch": 2.0130563798219585, + "grad_norm": 0.600084125995636, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7559521198272705, + "num_tokens": 1625828460.0, + "step": 3392 + }, + { + "epoch": 2.0136498516320476, + "grad_norm": 0.5531752705574036, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7705729603767395, + "num_tokens": 1626295535.0, + "step": 3393 + }, + { + "epoch": 2.0142433234421366, + "grad_norm": 0.5849127173423767, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7649058103561401, + "num_tokens": 1626732896.0, + "step": 3394 + }, + { + "epoch": 2.0148367952522257, + "grad_norm": 0.6026742458343506, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7705249786376953, + 
"num_tokens": 1627236191.0, + "step": 3395 + }, + { + "epoch": 2.0154302670623148, + "grad_norm": 0.5628815293312073, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7694478034973145, + "num_tokens": 1627665156.0, + "step": 3396 + }, + { + "epoch": 2.0160237388724034, + "grad_norm": 0.5473345518112183, + "learning_rate": 1e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.7734875082969666, + "num_tokens": 1628132602.0, + "step": 3397 + }, + { + "epoch": 2.0166172106824924, + "grad_norm": 0.5829830169677734, + "learning_rate": 1e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7703266143798828, + "num_tokens": 1628571049.0, + "step": 3398 + }, + { + "epoch": 2.0172106824925815, + "grad_norm": 0.6130585670471191, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7643864154815674, + "num_tokens": 1629020169.0, + "step": 3399 + }, + { + "epoch": 2.0178041543026706, + "grad_norm": 0.5297088623046875, + "learning_rate": 1e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7665930986404419, + "num_tokens": 1629507517.0, + "step": 3400 + }, + { + "epoch": 2.0183976261127596, + "grad_norm": 0.5400259494781494, + "learning_rate": 1e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7665640115737915, + "num_tokens": 1630001374.0, + "step": 3401 + }, + { + "epoch": 2.0189910979228487, + "grad_norm": 0.5824767351150513, + "learning_rate": 1e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7768020629882812, + "num_tokens": 1630455294.0, + "step": 3402 + }, + { + "epoch": 2.0195845697329378, + "grad_norm": 0.5389716625213623, + "learning_rate": 1e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7796373963356018, + "num_tokens": 1630952904.0, + "step": 3403 + }, + { + "epoch": 2.020178041543027, + "grad_norm": 0.5384160876274109, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7721477150917053, + "num_tokens": 1631448243.0, + "step": 3404 + }, + { + "epoch": 2.020771513353116, + "grad_norm": 0.5281323194503784, + 
"learning_rate": 1e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7709698677062988, + "num_tokens": 1631919418.0, + "step": 3405 + }, + { + "epoch": 2.021364985163205, + "grad_norm": 0.5586187243461609, + "learning_rate": 1e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.7821450233459473, + "num_tokens": 1632366120.0, + "step": 3406 + }, + { + "epoch": 2.0219584569732936, + "grad_norm": 0.5558237433433533, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.770779013633728, + "num_tokens": 1632849900.0, + "step": 3407 + }, + { + "epoch": 2.0225519287833826, + "grad_norm": 0.5546467900276184, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7639909982681274, + "num_tokens": 1633333161.0, + "step": 3408 + }, + { + "epoch": 2.0231454005934717, + "grad_norm": 0.5513067245483398, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7715657353401184, + "num_tokens": 1633834059.0, + "step": 3409 + }, + { + "epoch": 2.0237388724035608, + "grad_norm": 0.5295056104660034, + "learning_rate": 1e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7706298828125, + "num_tokens": 1634325100.0, + "step": 3410 + }, + { + "epoch": 2.02433234421365, + "grad_norm": 0.5302460193634033, + "learning_rate": 1e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7583193182945251, + "num_tokens": 1634790194.0, + "step": 3411 + }, + { + "epoch": 2.024925816023739, + "grad_norm": 0.5262710452079773, + "learning_rate": 1e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7808275818824768, + "num_tokens": 1635281728.0, + "step": 3412 + }, + { + "epoch": 2.025519287833828, + "grad_norm": 0.5444704294204712, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7621673345565796, + "num_tokens": 1635753679.0, + "step": 3413 + }, + { + "epoch": 2.026112759643917, + "grad_norm": 0.5276353359222412, + "learning_rate": 1e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7801164984703064, + "num_tokens": 1636241349.0, + "step": 
3414 + }, + { + "epoch": 2.026706231454006, + "grad_norm": 0.5555098652839661, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7610339522361755, + "num_tokens": 1636719371.0, + "step": 3415 + }, + { + "epoch": 2.027299703264095, + "grad_norm": 0.5455575585365295, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7697902321815491, + "num_tokens": 1637186582.0, + "step": 3416 + }, + { + "epoch": 2.0278931750741838, + "grad_norm": 0.5175855755805969, + "learning_rate": 1e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7798950672149658, + "num_tokens": 1637713061.0, + "step": 3417 + }, + { + "epoch": 2.028486646884273, + "grad_norm": 0.5732153654098511, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7582237720489502, + "num_tokens": 1638181952.0, + "step": 3418 + }, + { + "epoch": 2.029080118694362, + "grad_norm": 0.535581648349762, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7645176649093628, + "num_tokens": 1638667597.0, + "step": 3419 + }, + { + "epoch": 2.029673590504451, + "grad_norm": 0.5372406244277954, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7731235027313232, + "num_tokens": 1639137180.0, + "step": 3420 + }, + { + "epoch": 2.03026706231454, + "grad_norm": 0.5392864346504211, + "learning_rate": 1e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.7813034653663635, + "num_tokens": 1639600454.0, + "step": 3421 + }, + { + "epoch": 2.030860534124629, + "grad_norm": 0.5864229202270508, + "learning_rate": 1e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7662660479545593, + "num_tokens": 1640029151.0, + "step": 3422 + }, + { + "epoch": 2.031454005934718, + "grad_norm": 0.5816478729248047, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7664304971694946, + "num_tokens": 1640481569.0, + "step": 3423 + }, + { + "epoch": 2.032047477744807, + "grad_norm": 0.5596352815628052, + "learning_rate": 1e-06, + "loss": 0.7389, + 
"mean_token_accuracy": 0.7672566175460815, + "num_tokens": 1640924976.0, + "step": 3424 + }, + { + "epoch": 2.0326409495548963, + "grad_norm": 0.5355221033096313, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.766345202922821, + "num_tokens": 1641432814.0, + "step": 3425 + }, + { + "epoch": 2.0332344213649853, + "grad_norm": 0.5998988151550293, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7738274335861206, + "num_tokens": 1641902245.0, + "step": 3426 + }, + { + "epoch": 2.0338278931750744, + "grad_norm": 0.5281990170478821, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7737746238708496, + "num_tokens": 1642413074.0, + "step": 3427 + }, + { + "epoch": 2.034421364985163, + "grad_norm": 0.5542996525764465, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7658374309539795, + "num_tokens": 1642867564.0, + "step": 3428 + }, + { + "epoch": 2.035014836795252, + "grad_norm": 0.5705429315567017, + "learning_rate": 1e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7749576568603516, + "num_tokens": 1643314596.0, + "step": 3429 + }, + { + "epoch": 2.035608308605341, + "grad_norm": 0.5541284680366516, + "learning_rate": 1e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7720494270324707, + "num_tokens": 1643781112.0, + "step": 3430 + }, + { + "epoch": 2.03620178041543, + "grad_norm": 0.5717093348503113, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7646266222000122, + "num_tokens": 1644207811.0, + "step": 3431 + }, + { + "epoch": 2.0367952522255193, + "grad_norm": 0.589201807975769, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7708687782287598, + "num_tokens": 1644658740.0, + "step": 3432 + }, + { + "epoch": 2.0373887240356083, + "grad_norm": 0.5555853247642517, + "learning_rate": 1e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.778126060962677, + "num_tokens": 1645134944.0, + "step": 3433 + }, + { + "epoch": 2.0379821958456974, 
+ "grad_norm": 0.5477283596992493, + "learning_rate": 1e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7741219997406006, + "num_tokens": 1645628994.0, + "step": 3434 + }, + { + "epoch": 2.0385756676557865, + "grad_norm": 0.5192553997039795, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7764137983322144, + "num_tokens": 1646125747.0, + "step": 3435 + }, + { + "epoch": 2.0391691394658755, + "grad_norm": 0.5617357492446899, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7758151292800903, + "num_tokens": 1646647230.0, + "step": 3436 + }, + { + "epoch": 2.0397626112759646, + "grad_norm": 0.5530866980552673, + "learning_rate": 1e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.770186722278595, + "num_tokens": 1647129622.0, + "step": 3437 + }, + { + "epoch": 2.040356083086053, + "grad_norm": 0.5433862209320068, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.766640841960907, + "num_tokens": 1647618617.0, + "step": 3438 + }, + { + "epoch": 2.0409495548961423, + "grad_norm": 0.5241606831550598, + "learning_rate": 1e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.773693323135376, + "num_tokens": 1648110607.0, + "step": 3439 + }, + { + "epoch": 2.0415430267062313, + "grad_norm": 0.5812702775001526, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7718484401702881, + "num_tokens": 1648547884.0, + "step": 3440 + }, + { + "epoch": 2.0421364985163204, + "grad_norm": 0.5571643114089966, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7649124264717102, + "num_tokens": 1649001281.0, + "step": 3441 + }, + { + "epoch": 2.0427299703264095, + "grad_norm": 0.5381041169166565, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7620459794998169, + "num_tokens": 1649481729.0, + "step": 3442 + }, + { + "epoch": 2.0433234421364985, + "grad_norm": 0.5840572714805603, + "learning_rate": 1e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7695412039756775, 
+ "num_tokens": 1649950578.0, + "step": 3443 + }, + { + "epoch": 2.0439169139465876, + "grad_norm": 0.5522478818893433, + "learning_rate": 1e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.772678554058075, + "num_tokens": 1650422764.0, + "step": 3444 + }, + { + "epoch": 2.0445103857566767, + "grad_norm": 0.5291880369186401, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7761494517326355, + "num_tokens": 1650905159.0, + "step": 3445 + }, + { + "epoch": 2.0451038575667657, + "grad_norm": 0.5314337611198425, + "learning_rate": 1e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7652455568313599, + "num_tokens": 1651420995.0, + "step": 3446 + }, + { + "epoch": 2.045697329376855, + "grad_norm": 0.5384930372238159, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7673193216323853, + "num_tokens": 1651888607.0, + "step": 3447 + }, + { + "epoch": 2.0462908011869434, + "grad_norm": 0.5916008353233337, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7700802087783813, + "num_tokens": 1652371657.0, + "step": 3448 + }, + { + "epoch": 2.0468842729970325, + "grad_norm": 0.546824038028717, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7696895599365234, + "num_tokens": 1652832788.0, + "step": 3449 + }, + { + "epoch": 2.0474777448071215, + "grad_norm": 0.5140694975852966, + "learning_rate": 1e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7794121503829956, + "num_tokens": 1653319722.0, + "step": 3450 + }, + { + "epoch": 2.0480712166172106, + "grad_norm": 0.5609833002090454, + "learning_rate": 1e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7792508006095886, + "num_tokens": 1653776458.0, + "step": 3451 + }, + { + "epoch": 2.0486646884272997, + "grad_norm": 0.5145366787910461, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.774029016494751, + "num_tokens": 1654270808.0, + "step": 3452 + }, + { + "epoch": 2.0492581602373887, + "grad_norm": 0.5594705939292908, + 
"learning_rate": 1e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.7811943292617798, + "num_tokens": 1654746794.0, + "step": 3453 + }, + { + "epoch": 2.049851632047478, + "grad_norm": 0.5665714740753174, + "learning_rate": 1e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7759398221969604, + "num_tokens": 1655214249.0, + "step": 3454 + }, + { + "epoch": 2.050445103857567, + "grad_norm": 0.5198785662651062, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7724166512489319, + "num_tokens": 1655717026.0, + "step": 3455 + }, + { + "epoch": 2.051038575667656, + "grad_norm": 0.6188716888427734, + "learning_rate": 1e-06, + "loss": 0.6696, + "mean_token_accuracy": 0.7842819690704346, + "num_tokens": 1656134131.0, + "step": 3456 + }, + { + "epoch": 2.051632047477745, + "grad_norm": 0.5259239673614502, + "learning_rate": 1e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.7819886207580566, + "num_tokens": 1656623140.0, + "step": 3457 + }, + { + "epoch": 2.052225519287834, + "grad_norm": 0.5586754679679871, + "learning_rate": 1e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.78077232837677, + "num_tokens": 1657105771.0, + "step": 3458 + }, + { + "epoch": 2.0528189910979227, + "grad_norm": 0.5538620352745056, + "learning_rate": 1e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.7837264537811279, + "num_tokens": 1657567970.0, + "step": 3459 + }, + { + "epoch": 2.0534124629080117, + "grad_norm": 0.5446884632110596, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7725533843040466, + "num_tokens": 1658012687.0, + "step": 3460 + }, + { + "epoch": 2.054005934718101, + "grad_norm": 0.5432481169700623, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7638974189758301, + "num_tokens": 1658510709.0, + "step": 3461 + }, + { + "epoch": 2.05459940652819, + "grad_norm": 0.5572868585586548, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7647979855537415, + "num_tokens": 1658959856.0, + "step": 
3462 + }, + { + "epoch": 2.055192878338279, + "grad_norm": 0.5604308247566223, + "learning_rate": 1e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7534061670303345, + "num_tokens": 1659403158.0, + "step": 3463 + }, + { + "epoch": 2.055786350148368, + "grad_norm": 0.543530285358429, + "learning_rate": 1e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7776176929473877, + "num_tokens": 1659871247.0, + "step": 3464 + }, + { + "epoch": 2.056379821958457, + "grad_norm": 0.5350042581558228, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7633761167526245, + "num_tokens": 1660360495.0, + "step": 3465 + }, + { + "epoch": 2.056973293768546, + "grad_norm": 0.5430453419685364, + "learning_rate": 1e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.77357017993927, + "num_tokens": 1660826819.0, + "step": 3466 + }, + { + "epoch": 2.057566765578635, + "grad_norm": 0.5444956421852112, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7721356153488159, + "num_tokens": 1661288423.0, + "step": 3467 + }, + { + "epoch": 2.0581602373887242, + "grad_norm": 0.5627268552780151, + "learning_rate": 1e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7633346319198608, + "num_tokens": 1661750440.0, + "step": 3468 + }, + { + "epoch": 2.058753709198813, + "grad_norm": 0.5447527170181274, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7760410308837891, + "num_tokens": 1662193783.0, + "step": 3469 + }, + { + "epoch": 2.059347181008902, + "grad_norm": 0.5428047180175781, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7669515013694763, + "num_tokens": 1662673202.0, + "step": 3470 + }, + { + "epoch": 2.059940652818991, + "grad_norm": 0.5483320355415344, + "learning_rate": 1e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7711178064346313, + "num_tokens": 1663153972.0, + "step": 3471 + }, + { + "epoch": 2.06053412462908, + "grad_norm": 0.5681724548339844, + "learning_rate": 1e-06, + "loss": 0.7331, + 
"mean_token_accuracy": 0.7658753991127014, + "num_tokens": 1663618702.0, + "step": 3472 + }, + { + "epoch": 2.061127596439169, + "grad_norm": 0.5544432401657104, + "learning_rate": 1e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7769725918769836, + "num_tokens": 1664094240.0, + "step": 3473 + }, + { + "epoch": 2.061721068249258, + "grad_norm": 0.6237349510192871, + "learning_rate": 1e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.7745639085769653, + "num_tokens": 1664504159.0, + "step": 3474 + }, + { + "epoch": 2.0623145400593472, + "grad_norm": 0.5452467203140259, + "learning_rate": 1e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7602550387382507, + "num_tokens": 1664993035.0, + "step": 3475 + }, + { + "epoch": 2.0629080118694363, + "grad_norm": 0.5524291396141052, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7696404457092285, + "num_tokens": 1665472420.0, + "step": 3476 + }, + { + "epoch": 2.0635014836795254, + "grad_norm": 0.532714307308197, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7673034071922302, + "num_tokens": 1665953749.0, + "step": 3477 + }, + { + "epoch": 2.0640949554896144, + "grad_norm": 0.5329421162605286, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7651623487472534, + "num_tokens": 1666498060.0, + "step": 3478 + }, + { + "epoch": 2.0646884272997035, + "grad_norm": 0.5518712997436523, + "learning_rate": 1e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7779169082641602, + "num_tokens": 1666976035.0, + "step": 3479 + }, + { + "epoch": 2.065281899109792, + "grad_norm": 0.5474432706832886, + "learning_rate": 1e-06, + "loss": 0.6842, + "mean_token_accuracy": 0.7808053493499756, + "num_tokens": 1667481713.0, + "step": 3480 + }, + { + "epoch": 2.065875370919881, + "grad_norm": 0.524366021156311, + "learning_rate": 1e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7713073492050171, + "num_tokens": 1668012561.0, + "step": 3481 + }, + { + "epoch": 
2.0664688427299702, + "grad_norm": 0.5374754071235657, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7724282145500183, + "num_tokens": 1668479380.0, + "step": 3482 + }, + { + "epoch": 2.0670623145400593, + "grad_norm": 0.517825186252594, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.766416072845459, + "num_tokens": 1669018915.0, + "step": 3483 + }, + { + "epoch": 2.0676557863501484, + "grad_norm": 0.5118839740753174, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7794189453125, + "num_tokens": 1669544134.0, + "step": 3484 + }, + { + "epoch": 2.0682492581602374, + "grad_norm": 0.5534036159515381, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7634836435317993, + "num_tokens": 1670013396.0, + "step": 3485 + }, + { + "epoch": 2.0688427299703265, + "grad_norm": 0.5652926564216614, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7717620134353638, + "num_tokens": 1670486154.0, + "step": 3486 + }, + { + "epoch": 2.0694362017804155, + "grad_norm": 0.593547523021698, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7726178169250488, + "num_tokens": 1670941109.0, + "step": 3487 + }, + { + "epoch": 2.0700296735905046, + "grad_norm": 0.5001094937324524, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7735616564750671, + "num_tokens": 1671475233.0, + "step": 3488 + }, + { + "epoch": 2.0706231454005937, + "grad_norm": 0.5691470503807068, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7587747573852539, + "num_tokens": 1671989683.0, + "step": 3489 + }, + { + "epoch": 2.0712166172106823, + "grad_norm": 0.5546805262565613, + "learning_rate": 1e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7748262882232666, + "num_tokens": 1672508107.0, + "step": 3490 + }, + { + "epoch": 2.0718100890207714, + "grad_norm": 0.5901192426681519, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 
0.7681192755699158, + "num_tokens": 1672986292.0, + "step": 3491 + }, + { + "epoch": 2.0724035608308604, + "grad_norm": 0.5587781071662903, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7658510208129883, + "num_tokens": 1673497999.0, + "step": 3492 + }, + { + "epoch": 2.0729970326409495, + "grad_norm": 0.5379320979118347, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7660892605781555, + "num_tokens": 1673983073.0, + "step": 3493 + }, + { + "epoch": 2.0735905044510385, + "grad_norm": 0.526787281036377, + "learning_rate": 1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7712929248809814, + "num_tokens": 1674474894.0, + "step": 3494 + }, + { + "epoch": 2.0741839762611276, + "grad_norm": 0.55057692527771, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7724474668502808, + "num_tokens": 1674976012.0, + "step": 3495 + }, + { + "epoch": 2.0747774480712167, + "grad_norm": 0.5982988476753235, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7689944505691528, + "num_tokens": 1675465470.0, + "step": 3496 + }, + { + "epoch": 2.0753709198813057, + "grad_norm": 0.5672333836555481, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7701399326324463, + "num_tokens": 1675943350.0, + "step": 3497 + }, + { + "epoch": 2.075964391691395, + "grad_norm": 0.5239322781562805, + "learning_rate": 1e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.7805285453796387, + "num_tokens": 1676459246.0, + "step": 3498 + }, + { + "epoch": 2.076557863501484, + "grad_norm": 0.6033767461776733, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7650313377380371, + "num_tokens": 1676929352.0, + "step": 3499 + }, + { + "epoch": 2.077151335311573, + "grad_norm": 0.5465962290763855, + "learning_rate": 1e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7711745500564575, + "num_tokens": 1677409039.0, + "step": 3500 + }, + { + "epoch": 2.0777448071216615, + "grad_norm": 
0.5520946383476257, + "learning_rate": 1e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7724215984344482, + "num_tokens": 1677872754.0, + "step": 3501 + }, + { + "epoch": 2.0783382789317506, + "grad_norm": 0.5354759097099304, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.772165834903717, + "num_tokens": 1678377064.0, + "step": 3502 + }, + { + "epoch": 2.0789317507418397, + "grad_norm": 0.5710083842277527, + "learning_rate": 1e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7791554927825928, + "num_tokens": 1678811892.0, + "step": 3503 + }, + { + "epoch": 2.0795252225519287, + "grad_norm": 0.5930449962615967, + "learning_rate": 1e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.7856810092926025, + "num_tokens": 1679282360.0, + "step": 3504 + }, + { + "epoch": 2.080118694362018, + "grad_norm": 0.5632746815681458, + "learning_rate": 1e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.776945948600769, + "num_tokens": 1679737628.0, + "step": 3505 + }, + { + "epoch": 2.080712166172107, + "grad_norm": 0.5689655542373657, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7670688033103943, + "num_tokens": 1680190017.0, + "step": 3506 + }, + { + "epoch": 2.081305637982196, + "grad_norm": 0.548366367816925, + "learning_rate": 1e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7771053910255432, + "num_tokens": 1680658728.0, + "step": 3507 + }, + { + "epoch": 2.081899109792285, + "grad_norm": 0.567262589931488, + "learning_rate": 1e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.778302788734436, + "num_tokens": 1681111339.0, + "step": 3508 + }, + { + "epoch": 2.082492581602374, + "grad_norm": 0.5764176249504089, + "learning_rate": 1e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.7669987678527832, + "num_tokens": 1681558079.0, + "step": 3509 + }, + { + "epoch": 2.083086053412463, + "grad_norm": 0.5157011151313782, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7662045955657959, + "num_tokens": 
1682072528.0, + "step": 3510 + }, + { + "epoch": 2.0836795252225517, + "grad_norm": 0.5521277189254761, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7700607776641846, + "num_tokens": 1682550562.0, + "step": 3511 + }, + { + "epoch": 2.084272997032641, + "grad_norm": 0.5579573512077332, + "learning_rate": 1e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7752211093902588, + "num_tokens": 1683020624.0, + "step": 3512 + }, + { + "epoch": 2.08486646884273, + "grad_norm": 0.5526149868965149, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7661439776420593, + "num_tokens": 1683504241.0, + "step": 3513 + }, + { + "epoch": 2.085459940652819, + "grad_norm": 0.5468506813049316, + "learning_rate": 1e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7757675647735596, + "num_tokens": 1683977964.0, + "step": 3514 + }, + { + "epoch": 2.086053412462908, + "grad_norm": 0.5473606586456299, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7655478715896606, + "num_tokens": 1684470891.0, + "step": 3515 + }, + { + "epoch": 2.086646884272997, + "grad_norm": 0.5749850869178772, + "learning_rate": 1e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7758615016937256, + "num_tokens": 1684910243.0, + "step": 3516 + }, + { + "epoch": 2.087240356083086, + "grad_norm": 0.5939862728118896, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7693228721618652, + "num_tokens": 1685334933.0, + "step": 3517 + }, + { + "epoch": 2.087833827893175, + "grad_norm": 0.5333378314971924, + "learning_rate": 1e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7776707410812378, + "num_tokens": 1685811406.0, + "step": 3518 + }, + { + "epoch": 2.0884272997032642, + "grad_norm": 0.5850166082382202, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7619590759277344, + "num_tokens": 1686267101.0, + "step": 3519 + }, + { + "epoch": 2.0890207715133533, + "grad_norm": 0.5791059136390686, + "learning_rate": 
1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.773227334022522, + "num_tokens": 1686738572.0, + "step": 3520 + }, + { + "epoch": 2.089614243323442, + "grad_norm": 0.5583986639976501, + "learning_rate": 1e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.7869197726249695, + "num_tokens": 1687239865.0, + "step": 3521 + }, + { + "epoch": 2.090207715133531, + "grad_norm": 0.5800155401229858, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7704079747200012, + "num_tokens": 1687702115.0, + "step": 3522 + }, + { + "epoch": 2.09080118694362, + "grad_norm": 0.571467936038971, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7504817843437195, + "num_tokens": 1688189099.0, + "step": 3523 + }, + { + "epoch": 2.091394658753709, + "grad_norm": 0.6065407991409302, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7614864110946655, + "num_tokens": 1688661968.0, + "step": 3524 + }, + { + "epoch": 2.091988130563798, + "grad_norm": 0.5413991212844849, + "learning_rate": 1e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7854573726654053, + "num_tokens": 1689178049.0, + "step": 3525 + }, + { + "epoch": 2.0925816023738872, + "grad_norm": 0.5469536781311035, + "learning_rate": 1e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.7840051651000977, + "num_tokens": 1689704411.0, + "step": 3526 + }, + { + "epoch": 2.0931750741839763, + "grad_norm": 0.5634246468544006, + "learning_rate": 1e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7761006951332092, + "num_tokens": 1690153364.0, + "step": 3527 + }, + { + "epoch": 2.0937685459940654, + "grad_norm": 0.5519788265228271, + "learning_rate": 1e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7655926942825317, + "num_tokens": 1690619140.0, + "step": 3528 + }, + { + "epoch": 2.0943620178041544, + "grad_norm": 0.5441221594810486, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7605458498001099, + "num_tokens": 1691128818.0, + "step": 3529 + }, + { + 
"epoch": 2.0949554896142435, + "grad_norm": 0.5460206270217896, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7721248865127563, + "num_tokens": 1691639289.0, + "step": 3530 + }, + { + "epoch": 2.0955489614243326, + "grad_norm": 0.5526307225227356, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.773240327835083, + "num_tokens": 1692138190.0, + "step": 3531 + }, + { + "epoch": 2.096142433234421, + "grad_norm": 0.5832743048667908, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7703183889389038, + "num_tokens": 1692644914.0, + "step": 3532 + }, + { + "epoch": 2.0967359050445102, + "grad_norm": 0.5649511218070984, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7644126415252686, + "num_tokens": 1693109870.0, + "step": 3533 + }, + { + "epoch": 2.0973293768545993, + "grad_norm": 0.5369591116905212, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7690600752830505, + "num_tokens": 1693608667.0, + "step": 3534 + }, + { + "epoch": 2.0979228486646884, + "grad_norm": 0.5147103071212769, + "learning_rate": 1e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7732354402542114, + "num_tokens": 1694130854.0, + "step": 3535 + }, + { + "epoch": 2.0985163204747774, + "grad_norm": 0.5855104327201843, + "learning_rate": 1e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7661241292953491, + "num_tokens": 1694585432.0, + "step": 3536 + }, + { + "epoch": 2.0991097922848665, + "grad_norm": 0.5389553904533386, + "learning_rate": 1e-06, + "loss": 0.6596, + "mean_token_accuracy": 0.7843605875968933, + "num_tokens": 1695065101.0, + "step": 3537 + }, + { + "epoch": 2.0997032640949556, + "grad_norm": 0.5477596521377563, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7620164752006531, + "num_tokens": 1695534831.0, + "step": 3538 + }, + { + "epoch": 2.1002967359050446, + "grad_norm": 0.5221799612045288, + "learning_rate": 1e-06, + "loss": 0.6717, + 
"mean_token_accuracy": 0.7853305339813232, + "num_tokens": 1696017447.0, + "step": 3539 + }, + { + "epoch": 2.1008902077151337, + "grad_norm": 0.5351828932762146, + "learning_rate": 1e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7651580572128296, + "num_tokens": 1696503495.0, + "step": 3540 + }, + { + "epoch": 2.1014836795252227, + "grad_norm": 0.5348999500274658, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7759232521057129, + "num_tokens": 1697013801.0, + "step": 3541 + }, + { + "epoch": 2.1020771513353114, + "grad_norm": 0.5623833537101746, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7664308547973633, + "num_tokens": 1697464640.0, + "step": 3542 + }, + { + "epoch": 2.1026706231454004, + "grad_norm": 0.528038740158081, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7584048509597778, + "num_tokens": 1697935269.0, + "step": 3543 + }, + { + "epoch": 2.1032640949554895, + "grad_norm": 0.5688831806182861, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.755759596824646, + "num_tokens": 1698409077.0, + "step": 3544 + }, + { + "epoch": 2.1038575667655786, + "grad_norm": 0.5857453346252441, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7785269618034363, + "num_tokens": 1698823283.0, + "step": 3545 + }, + { + "epoch": 2.1044510385756676, + "grad_norm": 0.5633919835090637, + "learning_rate": 1e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.7774574756622314, + "num_tokens": 1699318158.0, + "step": 3546 + }, + { + "epoch": 2.1050445103857567, + "grad_norm": 0.5768100619316101, + "learning_rate": 1e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7619747519493103, + "num_tokens": 1699773125.0, + "step": 3547 + }, + { + "epoch": 2.1056379821958457, + "grad_norm": 0.5295242667198181, + "learning_rate": 1e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7899291515350342, + "num_tokens": 1700245276.0, + "step": 3548 + }, + { + "epoch": 
2.106231454005935, + "grad_norm": 0.5967516303062439, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7497682571411133, + "num_tokens": 1700706118.0, + "step": 3549 + }, + { + "epoch": 2.106824925816024, + "grad_norm": 0.5749786496162415, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7718856930732727, + "num_tokens": 1701161863.0, + "step": 3550 + }, + { + "epoch": 2.107418397626113, + "grad_norm": 0.549137532711029, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7542670965194702, + "num_tokens": 1701607067.0, + "step": 3551 + }, + { + "epoch": 2.1080118694362016, + "grad_norm": 0.59815514087677, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7711844444274902, + "num_tokens": 1702063676.0, + "step": 3552 + }, + { + "epoch": 2.1086053412462906, + "grad_norm": 0.5487157106399536, + "learning_rate": 1e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7554696202278137, + "num_tokens": 1702542037.0, + "step": 3553 + }, + { + "epoch": 2.1091988130563797, + "grad_norm": 0.5261229276657104, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7694745063781738, + "num_tokens": 1703055207.0, + "step": 3554 + }, + { + "epoch": 2.1097922848664687, + "grad_norm": 0.5464497208595276, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7626314759254456, + "num_tokens": 1703562853.0, + "step": 3555 + }, + { + "epoch": 2.110385756676558, + "grad_norm": 0.6804751753807068, + "learning_rate": 1e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7792057991027832, + "num_tokens": 1704066586.0, + "step": 3556 + }, + { + "epoch": 2.110979228486647, + "grad_norm": 0.5673225522041321, + "learning_rate": 1e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7812117338180542, + "num_tokens": 1704549346.0, + "step": 3557 + }, + { + "epoch": 2.111572700296736, + "grad_norm": 0.5408616662025452, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 
0.7637031078338623, + "num_tokens": 1705046114.0, + "step": 3558 + }, + { + "epoch": 2.112166172106825, + "grad_norm": 0.5186946988105774, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7703529596328735, + "num_tokens": 1705553029.0, + "step": 3559 + }, + { + "epoch": 2.112759643916914, + "grad_norm": 0.5476016402244568, + "learning_rate": 1e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7817575931549072, + "num_tokens": 1706028082.0, + "step": 3560 + }, + { + "epoch": 2.113353115727003, + "grad_norm": 0.5354681015014648, + "learning_rate": 1e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7733557224273682, + "num_tokens": 1706514442.0, + "step": 3561 + }, + { + "epoch": 2.113946587537092, + "grad_norm": 0.5421843528747559, + "learning_rate": 1e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7653418779373169, + "num_tokens": 1706984483.0, + "step": 3562 + }, + { + "epoch": 2.114540059347181, + "grad_norm": 0.5696046948432922, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7699600458145142, + "num_tokens": 1707472457.0, + "step": 3563 + }, + { + "epoch": 2.11513353115727, + "grad_norm": 0.565081000328064, + "learning_rate": 1e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7798737287521362, + "num_tokens": 1707932873.0, + "step": 3564 + }, + { + "epoch": 2.115727002967359, + "grad_norm": 0.5140970349311829, + "learning_rate": 1e-06, + "loss": 0.6698, + "mean_token_accuracy": 0.7838587164878845, + "num_tokens": 1708441466.0, + "step": 3565 + }, + { + "epoch": 2.116320474777448, + "grad_norm": 0.5455077290534973, + "learning_rate": 1e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7725828886032104, + "num_tokens": 1708922967.0, + "step": 3566 + }, + { + "epoch": 2.116913946587537, + "grad_norm": 0.5508245229721069, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7743484973907471, + "num_tokens": 1709423937.0, + "step": 3567 + }, + { + "epoch": 2.117507418397626, + "grad_norm": 
0.5306257009506226, + "learning_rate": 1e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7775634527206421, + "num_tokens": 1709888008.0, + "step": 3568 + }, + { + "epoch": 2.118100890207715, + "grad_norm": 0.5182341933250427, + "learning_rate": 1e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7845609188079834, + "num_tokens": 1710401838.0, + "step": 3569 + }, + { + "epoch": 2.1186943620178043, + "grad_norm": 0.5592929720878601, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7570971250534058, + "num_tokens": 1710875988.0, + "step": 3570 + }, + { + "epoch": 2.1192878338278933, + "grad_norm": 0.5834131240844727, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.768468976020813, + "num_tokens": 1711349677.0, + "step": 3571 + }, + { + "epoch": 2.1198813056379824, + "grad_norm": 0.5357608199119568, + "learning_rate": 1e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7663289904594421, + "num_tokens": 1711854786.0, + "step": 3572 + }, + { + "epoch": 2.120474777448071, + "grad_norm": 0.5829777121543884, + "learning_rate": 1e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7585767507553101, + "num_tokens": 1712297442.0, + "step": 3573 + }, + { + "epoch": 2.12106824925816, + "grad_norm": 0.53373783826828, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.772110104560852, + "num_tokens": 1712801944.0, + "step": 3574 + }, + { + "epoch": 2.121661721068249, + "grad_norm": 0.5639944672584534, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7681090831756592, + "num_tokens": 1713289753.0, + "step": 3575 + }, + { + "epoch": 2.122255192878338, + "grad_norm": 0.5511237382888794, + "learning_rate": 1e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7863823771476746, + "num_tokens": 1713730375.0, + "step": 3576 + }, + { + "epoch": 2.1228486646884273, + "grad_norm": 0.524765133857727, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7570904493331909, + "num_tokens": 
1714263306.0, + "step": 3577 + }, + { + "epoch": 2.1234421364985163, + "grad_norm": 0.5536831021308899, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7705607414245605, + "num_tokens": 1714740288.0, + "step": 3578 + }, + { + "epoch": 2.1240356083086054, + "grad_norm": 0.5422580242156982, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7674480080604553, + "num_tokens": 1715239986.0, + "step": 3579 + }, + { + "epoch": 2.1246290801186944, + "grad_norm": 0.5607481002807617, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7669041156768799, + "num_tokens": 1715725411.0, + "step": 3580 + }, + { + "epoch": 2.1252225519287835, + "grad_norm": 0.5546451807022095, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7652099132537842, + "num_tokens": 1716205898.0, + "step": 3581 + }, + { + "epoch": 2.1258160237388726, + "grad_norm": 0.5655434131622314, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7631528377532959, + "num_tokens": 1716659410.0, + "step": 3582 + }, + { + "epoch": 2.1264094955489616, + "grad_norm": 0.5744324922561646, + "learning_rate": 1e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7826844453811646, + "num_tokens": 1717151585.0, + "step": 3583 + }, + { + "epoch": 2.1270029673590503, + "grad_norm": 0.5370541214942932, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7772486805915833, + "num_tokens": 1717640562.0, + "step": 3584 + }, + { + "epoch": 2.1275964391691393, + "grad_norm": 0.5517163276672363, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7701400518417358, + "num_tokens": 1718129479.0, + "step": 3585 + }, + { + "epoch": 2.1281899109792284, + "grad_norm": 0.5204896926879883, + "learning_rate": 1e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7737157344818115, + "num_tokens": 1718617054.0, + "step": 3586 + }, + { + "epoch": 2.1287833827893174, + "grad_norm": 0.5316011905670166, + 
"learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7730283737182617, + "num_tokens": 1719105903.0, + "step": 3587 + }, + { + "epoch": 2.1293768545994065, + "grad_norm": 0.5630130767822266, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.771263062953949, + "num_tokens": 1719586638.0, + "step": 3588 + }, + { + "epoch": 2.1299703264094956, + "grad_norm": 0.5187419056892395, + "learning_rate": 1e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7757992744445801, + "num_tokens": 1720098136.0, + "step": 3589 + }, + { + "epoch": 2.1305637982195846, + "grad_norm": 0.5759384632110596, + "learning_rate": 1e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7701282501220703, + "num_tokens": 1720551064.0, + "step": 3590 + }, + { + "epoch": 2.1311572700296737, + "grad_norm": 0.5447547435760498, + "learning_rate": 1e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7758841514587402, + "num_tokens": 1721044805.0, + "step": 3591 + }, + { + "epoch": 2.1317507418397628, + "grad_norm": 0.5198243260383606, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7588722705841064, + "num_tokens": 1721575637.0, + "step": 3592 + }, + { + "epoch": 2.132344213649852, + "grad_norm": 0.5140870213508606, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7731410264968872, + "num_tokens": 1722106563.0, + "step": 3593 + }, + { + "epoch": 2.1329376854599404, + "grad_norm": 0.5187997817993164, + "learning_rate": 1e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.782091498374939, + "num_tokens": 1722613651.0, + "step": 3594 + }, + { + "epoch": 2.1335311572700295, + "grad_norm": 0.5638911724090576, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7661904096603394, + "num_tokens": 1723067071.0, + "step": 3595 + }, + { + "epoch": 2.1341246290801186, + "grad_norm": 0.5776990652084351, + "learning_rate": 1e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7720823884010315, + "num_tokens": 1723488891.0, + 
"step": 3596 + }, + { + "epoch": 2.1347181008902076, + "grad_norm": 0.5696113109588623, + "learning_rate": 1e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7819621562957764, + "num_tokens": 1723909371.0, + "step": 3597 + }, + { + "epoch": 2.1353115727002967, + "grad_norm": 0.530806839466095, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7702655792236328, + "num_tokens": 1724420248.0, + "step": 3598 + }, + { + "epoch": 2.1359050445103858, + "grad_norm": 0.5613597631454468, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7655052542686462, + "num_tokens": 1724868678.0, + "step": 3599 + }, + { + "epoch": 2.136498516320475, + "grad_norm": 0.5394376516342163, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7676010131835938, + "num_tokens": 1725379040.0, + "step": 3600 + }, + { + "epoch": 2.137091988130564, + "grad_norm": 0.5631048083305359, + "learning_rate": 1e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.769194483757019, + "num_tokens": 1725844548.0, + "step": 3601 + }, + { + "epoch": 2.137685459940653, + "grad_norm": 0.5415511727333069, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7709943056106567, + "num_tokens": 1726326756.0, + "step": 3602 + }, + { + "epoch": 2.138278931750742, + "grad_norm": 0.5664710402488708, + "learning_rate": 1e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7715948820114136, + "num_tokens": 1726797242.0, + "step": 3603 + }, + { + "epoch": 2.138872403560831, + "grad_norm": 0.5698924660682678, + "learning_rate": 1e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7790921330451965, + "num_tokens": 1727272501.0, + "step": 3604 + }, + { + "epoch": 2.1394658753709197, + "grad_norm": 0.5563272833824158, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7606862783432007, + "num_tokens": 1727756588.0, + "step": 3605 + }, + { + "epoch": 2.1400593471810088, + "grad_norm": 0.5611901879310608, + "learning_rate": 1e-06, + "loss": 
0.7036, + "mean_token_accuracy": 0.7733136415481567, + "num_tokens": 1728226907.0, + "step": 3606 + }, + { + "epoch": 2.140652818991098, + "grad_norm": 0.5808849930763245, + "learning_rate": 1e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7761290073394775, + "num_tokens": 1728681053.0, + "step": 3607 + }, + { + "epoch": 2.141246290801187, + "grad_norm": 0.6072643399238586, + "learning_rate": 1e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7724664211273193, + "num_tokens": 1729112631.0, + "step": 3608 + }, + { + "epoch": 2.141839762611276, + "grad_norm": 0.5769797563552856, + "learning_rate": 1e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.7837411165237427, + "num_tokens": 1729579212.0, + "step": 3609 + }, + { + "epoch": 2.142433234421365, + "grad_norm": 0.5953533053398132, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7709968090057373, + "num_tokens": 1730016830.0, + "step": 3610 + }, + { + "epoch": 2.143026706231454, + "grad_norm": 0.5573709011077881, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7644574642181396, + "num_tokens": 1730500944.0, + "step": 3611 + }, + { + "epoch": 2.143620178041543, + "grad_norm": 0.5590368509292603, + "learning_rate": 1e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7766975164413452, + "num_tokens": 1731002847.0, + "step": 3612 + }, + { + "epoch": 2.144213649851632, + "grad_norm": 0.546676754951477, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7653051018714905, + "num_tokens": 1731481306.0, + "step": 3613 + }, + { + "epoch": 2.1448071216617213, + "grad_norm": 0.5568475723266602, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7775466442108154, + "num_tokens": 1731945426.0, + "step": 3614 + }, + { + "epoch": 2.14540059347181, + "grad_norm": 0.5681318640708923, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7627587914466858, + "num_tokens": 1732402478.0, + "step": 3615 + }, + { + "epoch": 
2.145994065281899, + "grad_norm": 0.551016092300415, + "learning_rate": 1e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7603077292442322, + "num_tokens": 1732880665.0, + "step": 3616 + }, + { + "epoch": 2.146587537091988, + "grad_norm": 0.5758184790611267, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.75641268491745, + "num_tokens": 1733325408.0, + "step": 3617 + }, + { + "epoch": 2.147181008902077, + "grad_norm": 0.5438353419303894, + "learning_rate": 1e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7800137400627136, + "num_tokens": 1733812867.0, + "step": 3618 + }, + { + "epoch": 2.147774480712166, + "grad_norm": 0.5327871441841125, + "learning_rate": 1e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7693705558776855, + "num_tokens": 1734322652.0, + "step": 3619 + }, + { + "epoch": 2.148367952522255, + "grad_norm": 0.5611987113952637, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7620667219161987, + "num_tokens": 1734764443.0, + "step": 3620 + }, + { + "epoch": 2.1489614243323443, + "grad_norm": 0.54334557056427, + "learning_rate": 1e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7689230442047119, + "num_tokens": 1735246819.0, + "step": 3621 + }, + { + "epoch": 2.1495548961424333, + "grad_norm": 0.5530200600624084, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7741588354110718, + "num_tokens": 1735728907.0, + "step": 3622 + }, + { + "epoch": 2.1501483679525224, + "grad_norm": 0.5350744724273682, + "learning_rate": 1e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.782805860042572, + "num_tokens": 1736242251.0, + "step": 3623 + }, + { + "epoch": 2.1507418397626115, + "grad_norm": 0.5526752471923828, + "learning_rate": 1e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7759155631065369, + "num_tokens": 1736676598.0, + "step": 3624 + }, + { + "epoch": 2.1513353115727005, + "grad_norm": 0.5641459822654724, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 
0.7688339948654175, + "num_tokens": 1737164601.0, + "step": 3625 + }, + { + "epoch": 2.151928783382789, + "grad_norm": 0.5463383197784424, + "learning_rate": 1e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7766330242156982, + "num_tokens": 1737697258.0, + "step": 3626 + }, + { + "epoch": 2.152522255192878, + "grad_norm": 0.5047842264175415, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7704821825027466, + "num_tokens": 1738231253.0, + "step": 3627 + }, + { + "epoch": 2.1531157270029673, + "grad_norm": 0.5366432070732117, + "learning_rate": 1e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7635931968688965, + "num_tokens": 1738735538.0, + "step": 3628 + }, + { + "epoch": 2.1537091988130563, + "grad_norm": 0.5483989715576172, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7617409229278564, + "num_tokens": 1739236820.0, + "step": 3629 + }, + { + "epoch": 2.1543026706231454, + "grad_norm": 0.5537580847740173, + "learning_rate": 1e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7765738368034363, + "num_tokens": 1739694025.0, + "step": 3630 + }, + { + "epoch": 2.1548961424332345, + "grad_norm": 0.5630735754966736, + "learning_rate": 1e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7659626007080078, + "num_tokens": 1740183072.0, + "step": 3631 + }, + { + "epoch": 2.1554896142433235, + "grad_norm": 0.5518720746040344, + "learning_rate": 1e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7811968326568604, + "num_tokens": 1740643439.0, + "step": 3632 + }, + { + "epoch": 2.1560830860534126, + "grad_norm": 0.5584912896156311, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7683544158935547, + "num_tokens": 1741146934.0, + "step": 3633 + }, + { + "epoch": 2.1566765578635017, + "grad_norm": 0.5667561292648315, + "learning_rate": 1e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7702509164810181, + "num_tokens": 1741644608.0, + "step": 3634 + }, + { + "epoch": 2.1572700296735903, + "grad_norm": 
0.5487796068191528, + "learning_rate": 1e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7718130946159363, + "num_tokens": 1742162059.0, + "step": 3635 + }, + { + "epoch": 2.1578635014836793, + "grad_norm": 0.5721859335899353, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.772408127784729, + "num_tokens": 1742617163.0, + "step": 3636 + }, + { + "epoch": 2.1584569732937684, + "grad_norm": 0.5284261703491211, + "learning_rate": 1e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7771041393280029, + "num_tokens": 1743118738.0, + "step": 3637 + }, + { + "epoch": 2.1590504451038575, + "grad_norm": 0.5603710412979126, + "learning_rate": 1e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7768998146057129, + "num_tokens": 1743601327.0, + "step": 3638 + }, + { + "epoch": 2.1596439169139465, + "grad_norm": 0.552489161491394, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7685710787773132, + "num_tokens": 1744088578.0, + "step": 3639 + }, + { + "epoch": 2.1602373887240356, + "grad_norm": 0.5288505554199219, + "learning_rate": 1e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7704088687896729, + "num_tokens": 1744580201.0, + "step": 3640 + }, + { + "epoch": 2.1608308605341247, + "grad_norm": 0.5677116513252258, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7613558769226074, + "num_tokens": 1745027094.0, + "step": 3641 + }, + { + "epoch": 2.1614243323442137, + "grad_norm": 0.5649921894073486, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7745864391326904, + "num_tokens": 1745485288.0, + "step": 3642 + }, + { + "epoch": 2.162017804154303, + "grad_norm": 0.5384460091590881, + "learning_rate": 1e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7800443172454834, + "num_tokens": 1745969046.0, + "step": 3643 + }, + { + "epoch": 2.162611275964392, + "grad_norm": 0.5357629060745239, + "learning_rate": 1e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7698798179626465, + "num_tokens": 
1746489109.0, + "step": 3644 + }, + { + "epoch": 2.163204747774481, + "grad_norm": 0.5654807090759277, + "learning_rate": 1e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.7817627191543579, + "num_tokens": 1746930136.0, + "step": 3645 + }, + { + "epoch": 2.1637982195845695, + "grad_norm": 0.5370548367500305, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7667884826660156, + "num_tokens": 1747425525.0, + "step": 3646 + }, + { + "epoch": 2.1643916913946586, + "grad_norm": 0.5490816235542297, + "learning_rate": 1e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7769260406494141, + "num_tokens": 1747925694.0, + "step": 3647 + }, + { + "epoch": 2.1649851632047477, + "grad_norm": 0.5737757682800293, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7519556879997253, + "num_tokens": 1748383494.0, + "step": 3648 + }, + { + "epoch": 2.1655786350148367, + "grad_norm": 0.5572394132614136, + "learning_rate": 1e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7746952772140503, + "num_tokens": 1748857440.0, + "step": 3649 + }, + { + "epoch": 2.166172106824926, + "grad_norm": 0.5566679239273071, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7722698450088501, + "num_tokens": 1749370399.0, + "step": 3650 + }, + { + "epoch": 2.166765578635015, + "grad_norm": 0.5509106516838074, + "learning_rate": 1e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7653419375419617, + "num_tokens": 1749851399.0, + "step": 3651 + }, + { + "epoch": 2.167359050445104, + "grad_norm": 0.5904065370559692, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7483130693435669, + "num_tokens": 1750295089.0, + "step": 3652 + }, + { + "epoch": 2.167952522255193, + "grad_norm": 0.5416967868804932, + "learning_rate": 1e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7681665420532227, + "num_tokens": 1750776735.0, + "step": 3653 + }, + { + "epoch": 2.168545994065282, + "grad_norm": 0.5545541644096375, + "learning_rate": 
1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7738929390907288, + "num_tokens": 1751248432.0, + "step": 3654 + }, + { + "epoch": 2.169139465875371, + "grad_norm": 0.583490252494812, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7601470947265625, + "num_tokens": 1751711425.0, + "step": 3655 + }, + { + "epoch": 2.1697329376854597, + "grad_norm": 0.5311751365661621, + "learning_rate": 1e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.776369035243988, + "num_tokens": 1752233010.0, + "step": 3656 + }, + { + "epoch": 2.170326409495549, + "grad_norm": 0.5414220690727234, + "learning_rate": 1e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7783467769622803, + "num_tokens": 1752717530.0, + "step": 3657 + }, + { + "epoch": 2.170919881305638, + "grad_norm": 0.5542644262313843, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7585515975952148, + "num_tokens": 1753173709.0, + "step": 3658 + }, + { + "epoch": 2.171513353115727, + "grad_norm": 0.5602869391441345, + "learning_rate": 1e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7693324685096741, + "num_tokens": 1753645308.0, + "step": 3659 + }, + { + "epoch": 2.172106824925816, + "grad_norm": 0.5358572602272034, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7666014432907104, + "num_tokens": 1754165881.0, + "step": 3660 + }, + { + "epoch": 2.172700296735905, + "grad_norm": 0.5066994428634644, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7737604975700378, + "num_tokens": 1754700135.0, + "step": 3661 + }, + { + "epoch": 2.173293768545994, + "grad_norm": 0.5398450493812561, + "learning_rate": 1e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.785171389579773, + "num_tokens": 1755163191.0, + "step": 3662 + }, + { + "epoch": 2.173887240356083, + "grad_norm": 0.5453737378120422, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7668521404266357, + "num_tokens": 1755655594.0, + "step": 3663 + }, + { + 
"epoch": 2.1744807121661722, + "grad_norm": 0.7120149731636047, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7546063661575317, + "num_tokens": 1756089690.0, + "step": 3664 + }, + { + "epoch": 2.1750741839762613, + "grad_norm": 0.5387491583824158, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7711808085441589, + "num_tokens": 1756576844.0, + "step": 3665 + }, + { + "epoch": 2.1756676557863504, + "grad_norm": 0.5524898767471313, + "learning_rate": 1e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7762926816940308, + "num_tokens": 1757027272.0, + "step": 3666 + }, + { + "epoch": 2.176261127596439, + "grad_norm": 0.5829726457595825, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.745497465133667, + "num_tokens": 1757467508.0, + "step": 3667 + }, + { + "epoch": 2.176854599406528, + "grad_norm": 0.5418738722801208, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7695536613464355, + "num_tokens": 1757935127.0, + "step": 3668 + }, + { + "epoch": 2.177448071216617, + "grad_norm": 0.5714964270591736, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7652970552444458, + "num_tokens": 1758412107.0, + "step": 3669 + }, + { + "epoch": 2.178041543026706, + "grad_norm": 0.5396102666854858, + "learning_rate": 1e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.767341673374176, + "num_tokens": 1758897578.0, + "step": 3670 + }, + { + "epoch": 2.1786350148367952, + "grad_norm": 0.5728226900100708, + "learning_rate": 1e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.777747631072998, + "num_tokens": 1759362382.0, + "step": 3671 + }, + { + "epoch": 2.1792284866468843, + "grad_norm": 0.5225496888160706, + "learning_rate": 1e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7798713445663452, + "num_tokens": 1759878917.0, + "step": 3672 + }, + { + "epoch": 2.1798219584569734, + "grad_norm": 0.5142307281494141, + "learning_rate": 1e-06, + "loss": 0.6958, + 
"mean_token_accuracy": 0.7799980640411377, + "num_tokens": 1760384297.0, + "step": 3673 + }, + { + "epoch": 2.1804154302670624, + "grad_norm": 0.5208991169929504, + "learning_rate": 1e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7788372039794922, + "num_tokens": 1760870881.0, + "step": 3674 + }, + { + "epoch": 2.1810089020771515, + "grad_norm": 0.5500673055648804, + "learning_rate": 1e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7737239599227905, + "num_tokens": 1761375687.0, + "step": 3675 + }, + { + "epoch": 2.1816023738872405, + "grad_norm": 0.5567722916603088, + "learning_rate": 1e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7780352234840393, + "num_tokens": 1761859403.0, + "step": 3676 + }, + { + "epoch": 2.182195845697329, + "grad_norm": 0.5739048719406128, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7691722512245178, + "num_tokens": 1762311868.0, + "step": 3677 + }, + { + "epoch": 2.1827893175074182, + "grad_norm": 0.5140058994293213, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7699267268180847, + "num_tokens": 1762829066.0, + "step": 3678 + }, + { + "epoch": 2.1833827893175073, + "grad_norm": 0.5342866778373718, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7667311429977417, + "num_tokens": 1763320623.0, + "step": 3679 + }, + { + "epoch": 2.1839762611275964, + "grad_norm": 0.5536022186279297, + "learning_rate": 1e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7728993892669678, + "num_tokens": 1763778008.0, + "step": 3680 + }, + { + "epoch": 2.1845697329376854, + "grad_norm": 0.5548578500747681, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7597899436950684, + "num_tokens": 1764252733.0, + "step": 3681 + }, + { + "epoch": 2.1851632047477745, + "grad_norm": 0.5466969013214111, + "learning_rate": 1e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7768836617469788, + "num_tokens": 1764721946.0, + "step": 3682 + }, + { + "epoch": 
2.1857566765578635, + "grad_norm": 0.5488786101341248, + "learning_rate": 1e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7674543261528015, + "num_tokens": 1765194839.0, + "step": 3683 + }, + { + "epoch": 2.1863501483679526, + "grad_norm": 0.5283994078636169, + "learning_rate": 1e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7629300951957703, + "num_tokens": 1765704058.0, + "step": 3684 + }, + { + "epoch": 2.1869436201780417, + "grad_norm": 0.5602375864982605, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.750056803226471, + "num_tokens": 1766146930.0, + "step": 3685 + }, + { + "epoch": 2.1875370919881307, + "grad_norm": 0.5580068230628967, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7633455991744995, + "num_tokens": 1766654994.0, + "step": 3686 + }, + { + "epoch": 2.18813056379822, + "grad_norm": 0.5418792366981506, + "learning_rate": 1e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7808501720428467, + "num_tokens": 1767142093.0, + "step": 3687 + }, + { + "epoch": 2.1887240356083084, + "grad_norm": 0.5194998979568481, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7658714652061462, + "num_tokens": 1767638012.0, + "step": 3688 + }, + { + "epoch": 2.1893175074183975, + "grad_norm": 0.5125023722648621, + "learning_rate": 1e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7834100723266602, + "num_tokens": 1768151723.0, + "step": 3689 + }, + { + "epoch": 2.1899109792284865, + "grad_norm": 0.5263447761535645, + "learning_rate": 1e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7748023867607117, + "num_tokens": 1768691158.0, + "step": 3690 + }, + { + "epoch": 2.1905044510385756, + "grad_norm": 0.5780717730522156, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7615084052085876, + "num_tokens": 1769139519.0, + "step": 3691 + }, + { + "epoch": 2.1910979228486647, + "grad_norm": 0.564243733882904, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 
0.7577325105667114, + "num_tokens": 1769625816.0, + "step": 3692 + }, + { + "epoch": 2.1916913946587537, + "grad_norm": 0.5347951650619507, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.767615795135498, + "num_tokens": 1770129705.0, + "step": 3693 + }, + { + "epoch": 2.192284866468843, + "grad_norm": 0.5369188785552979, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.774527907371521, + "num_tokens": 1770615177.0, + "step": 3694 + }, + { + "epoch": 2.192878338278932, + "grad_norm": 0.5689095258712769, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7588558197021484, + "num_tokens": 1771099453.0, + "step": 3695 + }, + { + "epoch": 2.193471810089021, + "grad_norm": 0.5442962050437927, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7651348114013672, + "num_tokens": 1771613705.0, + "step": 3696 + }, + { + "epoch": 2.19406528189911, + "grad_norm": 0.5393714904785156, + "learning_rate": 1e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.7843136787414551, + "num_tokens": 1772103337.0, + "step": 3697 + }, + { + "epoch": 2.1946587537091986, + "grad_norm": 0.5643675923347473, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.769949197769165, + "num_tokens": 1772627011.0, + "step": 3698 + }, + { + "epoch": 2.1952522255192877, + "grad_norm": 0.5909314155578613, + "learning_rate": 1e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7703278660774231, + "num_tokens": 1773108509.0, + "step": 3699 + }, + { + "epoch": 2.1958456973293767, + "grad_norm": 0.548327624797821, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7646673917770386, + "num_tokens": 1773578761.0, + "step": 3700 + }, + { + "epoch": 2.196439169139466, + "grad_norm": 0.49744713306427, + "learning_rate": 1e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7836930155754089, + "num_tokens": 1774105414.0, + "step": 3701 + }, + { + "epoch": 2.197032640949555, + "grad_norm": 
0.5205599069595337, + "learning_rate": 1e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7767969369888306, + "num_tokens": 1774611864.0, + "step": 3702 + }, + { + "epoch": 2.197626112759644, + "grad_norm": 0.5251526832580566, + "learning_rate": 1e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7811002731323242, + "num_tokens": 1775112624.0, + "step": 3703 + }, + { + "epoch": 2.198219584569733, + "grad_norm": 0.5694937705993652, + "learning_rate": 1e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.777045726776123, + "num_tokens": 1775602075.0, + "step": 3704 + }, + { + "epoch": 2.198813056379822, + "grad_norm": 0.5458616614341736, + "learning_rate": 1e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7655351161956787, + "num_tokens": 1776056025.0, + "step": 3705 + }, + { + "epoch": 2.199406528189911, + "grad_norm": 0.5865711569786072, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7578580379486084, + "num_tokens": 1776537283.0, + "step": 3706 + }, + { + "epoch": 2.2, + "grad_norm": 0.5500839948654175, + "learning_rate": 1e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7724687457084656, + "num_tokens": 1776998430.0, + "step": 3707 + }, + { + "epoch": 2.2005934718100892, + "grad_norm": 0.5437154173851013, + "learning_rate": 1e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7778074741363525, + "num_tokens": 1777492893.0, + "step": 3708 + }, + { + "epoch": 2.201186943620178, + "grad_norm": 0.6015698313713074, + "learning_rate": 1e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7818140387535095, + "num_tokens": 1777926104.0, + "step": 3709 + }, + { + "epoch": 2.201780415430267, + "grad_norm": 0.5304538011550903, + "learning_rate": 1e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7806770205497742, + "num_tokens": 1778403831.0, + "step": 3710 + }, + { + "epoch": 2.202373887240356, + "grad_norm": 0.5401588678359985, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7693119049072266, + "num_tokens": 1778866108.0, + 
"step": 3711 + }, + { + "epoch": 2.202967359050445, + "grad_norm": 0.6256006956100464, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7547386288642883, + "num_tokens": 1779297095.0, + "step": 3712 + }, + { + "epoch": 2.203560830860534, + "grad_norm": 0.5347541570663452, + "learning_rate": 1e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7763243317604065, + "num_tokens": 1779795460.0, + "step": 3713 + }, + { + "epoch": 2.204154302670623, + "grad_norm": 0.5253000855445862, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7702150344848633, + "num_tokens": 1780272327.0, + "step": 3714 + }, + { + "epoch": 2.2047477744807122, + "grad_norm": 0.5454748868942261, + "learning_rate": 1e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7783849835395813, + "num_tokens": 1780718385.0, + "step": 3715 + }, + { + "epoch": 2.2053412462908013, + "grad_norm": 0.5560803413391113, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7658528685569763, + "num_tokens": 1781202192.0, + "step": 3716 + }, + { + "epoch": 2.2059347181008904, + "grad_norm": 0.5648099184036255, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.762252151966095, + "num_tokens": 1781669167.0, + "step": 3717 + }, + { + "epoch": 2.2065281899109794, + "grad_norm": 0.5589014291763306, + "learning_rate": 1e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7806038856506348, + "num_tokens": 1782136689.0, + "step": 3718 + }, + { + "epoch": 2.207121661721068, + "grad_norm": 0.5832424163818359, + "learning_rate": 1e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7751337289810181, + "num_tokens": 1782572215.0, + "step": 3719 + }, + { + "epoch": 2.207715133531157, + "grad_norm": 0.5491040945053101, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.761462926864624, + "num_tokens": 1783034584.0, + "step": 3720 + }, + { + "epoch": 2.208308605341246, + "grad_norm": 0.5520188808441162, + "learning_rate": 1e-06, + "loss": 
0.7707, + "mean_token_accuracy": 0.7568240165710449, + "num_tokens": 1783547024.0, + "step": 3721 + }, + { + "epoch": 2.2089020771513352, + "grad_norm": 0.5494591593742371, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7676829099655151, + "num_tokens": 1784011091.0, + "step": 3722 + }, + { + "epoch": 2.2094955489614243, + "grad_norm": 0.5343278050422668, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7639487385749817, + "num_tokens": 1784519154.0, + "step": 3723 + }, + { + "epoch": 2.2100890207715134, + "grad_norm": 0.5236397385597229, + "learning_rate": 1e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.7904701828956604, + "num_tokens": 1785022023.0, + "step": 3724 + }, + { + "epoch": 2.2106824925816024, + "grad_norm": 0.5632464289665222, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7744414210319519, + "num_tokens": 1785490307.0, + "step": 3725 + }, + { + "epoch": 2.2112759643916915, + "grad_norm": 0.5432556867599487, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7703161239624023, + "num_tokens": 1785987894.0, + "step": 3726 + }, + { + "epoch": 2.2118694362017806, + "grad_norm": 0.5367899537086487, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7600795030593872, + "num_tokens": 1786462445.0, + "step": 3727 + }, + { + "epoch": 2.2124629080118696, + "grad_norm": 0.5464260578155518, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7495481967926025, + "num_tokens": 1786970580.0, + "step": 3728 + }, + { + "epoch": 2.2130563798219587, + "grad_norm": 0.530190646648407, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7713330984115601, + "num_tokens": 1787457695.0, + "step": 3729 + }, + { + "epoch": 2.2136498516320473, + "grad_norm": 0.5597018599510193, + "learning_rate": 1e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7794042825698853, + "num_tokens": 1787907438.0, + "step": 3730 + }, + { + "epoch": 
2.2142433234421364, + "grad_norm": 0.5565666556358337, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7765320539474487, + "num_tokens": 1788362446.0, + "step": 3731 + }, + { + "epoch": 2.2148367952522254, + "grad_norm": 0.5555338263511658, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7747406959533691, + "num_tokens": 1788887911.0, + "step": 3732 + }, + { + "epoch": 2.2154302670623145, + "grad_norm": 0.5504344701766968, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7690883874893188, + "num_tokens": 1789385745.0, + "step": 3733 + }, + { + "epoch": 2.2160237388724036, + "grad_norm": 0.5528745055198669, + "learning_rate": 1e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7605447173118591, + "num_tokens": 1789864209.0, + "step": 3734 + }, + { + "epoch": 2.2166172106824926, + "grad_norm": 0.5419005155563354, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7721891403198242, + "num_tokens": 1790363428.0, + "step": 3735 + }, + { + "epoch": 2.2172106824925817, + "grad_norm": 0.569419801235199, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7710957527160645, + "num_tokens": 1790841528.0, + "step": 3736 + }, + { + "epoch": 2.2178041543026707, + "grad_norm": 0.5448594689369202, + "learning_rate": 1e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7745211124420166, + "num_tokens": 1791324139.0, + "step": 3737 + }, + { + "epoch": 2.21839762611276, + "grad_norm": 0.5184769630432129, + "learning_rate": 1e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.788224458694458, + "num_tokens": 1791800716.0, + "step": 3738 + }, + { + "epoch": 2.2189910979228484, + "grad_norm": 0.601795494556427, + "learning_rate": 1e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.774202287197113, + "num_tokens": 1792218556.0, + "step": 3739 + }, + { + "epoch": 2.2195845697329375, + "grad_norm": 0.5663579702377319, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 
0.7614901661872864, + "num_tokens": 1792692844.0, + "step": 3740 + }, + { + "epoch": 2.2201780415430266, + "grad_norm": 0.5432553887367249, + "learning_rate": 1e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.7871423959732056, + "num_tokens": 1793169531.0, + "step": 3741 + }, + { + "epoch": 2.2207715133531156, + "grad_norm": 0.5272479057312012, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7689098715782166, + "num_tokens": 1793668301.0, + "step": 3742 + }, + { + "epoch": 2.2213649851632047, + "grad_norm": 0.5647234916687012, + "learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7641516923904419, + "num_tokens": 1794152714.0, + "step": 3743 + }, + { + "epoch": 2.2219584569732937, + "grad_norm": 0.5497568249702454, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7689287066459656, + "num_tokens": 1794645283.0, + "step": 3744 + }, + { + "epoch": 2.222551928783383, + "grad_norm": 0.5368354916572571, + "learning_rate": 1e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.780838131904602, + "num_tokens": 1795121730.0, + "step": 3745 + }, + { + "epoch": 2.223145400593472, + "grad_norm": 0.5506481528282166, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7673406600952148, + "num_tokens": 1795639696.0, + "step": 3746 + }, + { + "epoch": 2.223738872403561, + "grad_norm": 0.5363726615905762, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7727624177932739, + "num_tokens": 1796130958.0, + "step": 3747 + }, + { + "epoch": 2.22433234421365, + "grad_norm": 0.5690662860870361, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7500229477882385, + "num_tokens": 1796593537.0, + "step": 3748 + }, + { + "epoch": 2.224925816023739, + "grad_norm": 0.5526880025863647, + "learning_rate": 1e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7663363218307495, + "num_tokens": 1797073239.0, + "step": 3749 + }, + { + "epoch": 2.2255192878338277, + "grad_norm": 
0.5588829517364502, + "learning_rate": 1e-06, + "loss": 0.6683, + "mean_token_accuracy": 0.7865846157073975, + "num_tokens": 1797559965.0, + "step": 3750 + }, + { + "epoch": 2.2261127596439167, + "grad_norm": 0.519291877746582, + "learning_rate": 1e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7819259762763977, + "num_tokens": 1798050015.0, + "step": 3751 + }, + { + "epoch": 2.226706231454006, + "grad_norm": 0.572380542755127, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7643671035766602, + "num_tokens": 1798517647.0, + "step": 3752 + }, + { + "epoch": 2.227299703264095, + "grad_norm": 0.5602575540542603, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7666012048721313, + "num_tokens": 1798974051.0, + "step": 3753 + }, + { + "epoch": 2.227893175074184, + "grad_norm": 0.5685480237007141, + "learning_rate": 1e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7794063091278076, + "num_tokens": 1799452681.0, + "step": 3754 + }, + { + "epoch": 2.228486646884273, + "grad_norm": 0.5741866827011108, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7671619653701782, + "num_tokens": 1799904735.0, + "step": 3755 + }, + { + "epoch": 2.229080118694362, + "grad_norm": 0.5282428860664368, + "learning_rate": 1e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7741876244544983, + "num_tokens": 1800411428.0, + "step": 3756 + }, + { + "epoch": 2.229673590504451, + "grad_norm": 0.550405740737915, + "learning_rate": 1e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7755990624427795, + "num_tokens": 1800892437.0, + "step": 3757 + }, + { + "epoch": 2.23026706231454, + "grad_norm": 0.5452971458435059, + "learning_rate": 1e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7807405591011047, + "num_tokens": 1801366791.0, + "step": 3758 + }, + { + "epoch": 2.2308605341246293, + "grad_norm": 0.5552560091018677, + "learning_rate": 1e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7637055516242981, + "num_tokens": 
1801852935.0, + "step": 3759 + }, + { + "epoch": 2.231454005934718, + "grad_norm": 0.5421711802482605, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7619556188583374, + "num_tokens": 1802331331.0, + "step": 3760 + }, + { + "epoch": 2.232047477744807, + "grad_norm": 0.5142015218734741, + "learning_rate": 1e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7606481313705444, + "num_tokens": 1802834083.0, + "step": 3761 + }, + { + "epoch": 2.232640949554896, + "grad_norm": 0.584206223487854, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7649704813957214, + "num_tokens": 1803272958.0, + "step": 3762 + }, + { + "epoch": 2.233234421364985, + "grad_norm": 0.5575966835021973, + "learning_rate": 1e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7516006231307983, + "num_tokens": 1803771317.0, + "step": 3763 + }, + { + "epoch": 2.233827893175074, + "grad_norm": 0.5612773299217224, + "learning_rate": 1e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7792472839355469, + "num_tokens": 1804238237.0, + "step": 3764 + }, + { + "epoch": 2.234421364985163, + "grad_norm": 0.5552130341529846, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7709389328956604, + "num_tokens": 1804742700.0, + "step": 3765 + }, + { + "epoch": 2.2350148367952523, + "grad_norm": 0.5785578489303589, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7703697681427002, + "num_tokens": 1805207366.0, + "step": 3766 + }, + { + "epoch": 2.2356083086053413, + "grad_norm": 0.5603469610214233, + "learning_rate": 1e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7638378739356995, + "num_tokens": 1805699558.0, + "step": 3767 + }, + { + "epoch": 2.2362017804154304, + "grad_norm": 0.5537175536155701, + "learning_rate": 1e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.772094190120697, + "num_tokens": 1806162634.0, + "step": 3768 + }, + { + "epoch": 2.2367952522255194, + "grad_norm": 0.5333566665649414, + "learning_rate": 
1e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7750875949859619, + "num_tokens": 1806686498.0, + "step": 3769 + }, + { + "epoch": 2.2373887240356085, + "grad_norm": 0.5779489278793335, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7600960731506348, + "num_tokens": 1807162831.0, + "step": 3770 + }, + { + "epoch": 2.237982195845697, + "grad_norm": 0.5408357977867126, + "learning_rate": 1e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7823020815849304, + "num_tokens": 1807627038.0, + "step": 3771 + }, + { + "epoch": 2.238575667655786, + "grad_norm": 0.5405860543251038, + "learning_rate": 1e-06, + "loss": 0.6807, + "mean_token_accuracy": 0.7819014191627502, + "num_tokens": 1808090072.0, + "step": 3772 + }, + { + "epoch": 2.2391691394658753, + "grad_norm": 0.5777102112770081, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7689704298973083, + "num_tokens": 1808519939.0, + "step": 3773 + }, + { + "epoch": 2.2397626112759643, + "grad_norm": 0.5307667851448059, + "learning_rate": 1e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7684025764465332, + "num_tokens": 1809052636.0, + "step": 3774 + }, + { + "epoch": 2.2403560830860534, + "grad_norm": 0.5457172989845276, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7560629844665527, + "num_tokens": 1809532497.0, + "step": 3775 + }, + { + "epoch": 2.2409495548961424, + "grad_norm": 0.5609560608863831, + "learning_rate": 1e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7784820795059204, + "num_tokens": 1809959486.0, + "step": 3776 + }, + { + "epoch": 2.2415430267062315, + "grad_norm": 0.5710480213165283, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7601024508476257, + "num_tokens": 1810414219.0, + "step": 3777 + }, + { + "epoch": 2.2421364985163206, + "grad_norm": 0.5416675806045532, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7605572938919067, + "num_tokens": 1810898779.0, + "step": 3778 + }, + 
{ + "epoch": 2.2427299703264096, + "grad_norm": 0.5398604273796082, + "learning_rate": 1e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7764979600906372, + "num_tokens": 1811386779.0, + "step": 3779 + }, + { + "epoch": 2.2433234421364987, + "grad_norm": 0.5365291833877563, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.776547908782959, + "num_tokens": 1811899707.0, + "step": 3780 + }, + { + "epoch": 2.2439169139465873, + "grad_norm": 0.5582263469696045, + "learning_rate": 1e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7608332633972168, + "num_tokens": 1812388647.0, + "step": 3781 + }, + { + "epoch": 2.2445103857566764, + "grad_norm": 0.5404694676399231, + "learning_rate": 1e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.7753517627716064, + "num_tokens": 1812850258.0, + "step": 3782 + }, + { + "epoch": 2.2451038575667654, + "grad_norm": 0.552073061466217, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7725484371185303, + "num_tokens": 1813326001.0, + "step": 3783 + }, + { + "epoch": 2.2456973293768545, + "grad_norm": 0.5508838891983032, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7710145711898804, + "num_tokens": 1813833446.0, + "step": 3784 + }, + { + "epoch": 2.2462908011869436, + "grad_norm": 0.5566490292549133, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7684805989265442, + "num_tokens": 1814281000.0, + "step": 3785 + }, + { + "epoch": 2.2468842729970326, + "grad_norm": 0.5444798469543457, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7595800161361694, + "num_tokens": 1814764732.0, + "step": 3786 + }, + { + "epoch": 2.2474777448071217, + "grad_norm": 0.5141119956970215, + "learning_rate": 1e-06, + "loss": 0.6727, + "mean_token_accuracy": 0.7891952991485596, + "num_tokens": 1815269732.0, + "step": 3787 + }, + { + "epoch": 2.2480712166172108, + "grad_norm": 0.5344054102897644, + "learning_rate": 1e-06, + "loss": 0.6822, + 
"mean_token_accuracy": 0.783362090587616, + "num_tokens": 1815769498.0, + "step": 3788 + }, + { + "epoch": 2.2486646884273, + "grad_norm": 0.5625359416007996, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.763701319694519, + "num_tokens": 1816230194.0, + "step": 3789 + }, + { + "epoch": 2.249258160237389, + "grad_norm": 0.5801148414611816, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7675591111183167, + "num_tokens": 1816663858.0, + "step": 3790 + }, + { + "epoch": 2.249851632047478, + "grad_norm": 0.5422804951667786, + "learning_rate": 1e-06, + "loss": 0.6565, + "mean_token_accuracy": 0.7879422903060913, + "num_tokens": 1817131668.0, + "step": 3791 + }, + { + "epoch": 2.2504451038575666, + "grad_norm": 0.5558539628982544, + "learning_rate": 1e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7771769762039185, + "num_tokens": 1817624058.0, + "step": 3792 + }, + { + "epoch": 2.2510385756676556, + "grad_norm": 0.5350643396377563, + "learning_rate": 1e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7796894311904907, + "num_tokens": 1818087086.0, + "step": 3793 + }, + { + "epoch": 2.2516320474777447, + "grad_norm": 0.5496528148651123, + "learning_rate": 1e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7757201194763184, + "num_tokens": 1818608675.0, + "step": 3794 + }, + { + "epoch": 2.2522255192878338, + "grad_norm": 0.5481062531471252, + "learning_rate": 1e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.7832180857658386, + "num_tokens": 1819088249.0, + "step": 3795 + }, + { + "epoch": 2.252818991097923, + "grad_norm": 0.5424137115478516, + "learning_rate": 1e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7655695676803589, + "num_tokens": 1819566521.0, + "step": 3796 + }, + { + "epoch": 2.253412462908012, + "grad_norm": 0.5612215995788574, + "learning_rate": 1e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.7715538740158081, + "num_tokens": 1820028264.0, + "step": 3797 + }, + { + "epoch": 2.254005934718101, + 
"grad_norm": 0.5154773592948914, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7735471129417419, + "num_tokens": 1820552674.0, + "step": 3798 + }, + { + "epoch": 2.25459940652819, + "grad_norm": 0.5600945353507996, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7528212070465088, + "num_tokens": 1821032357.0, + "step": 3799 + }, + { + "epoch": 2.255192878338279, + "grad_norm": 0.5551850199699402, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7720149159431458, + "num_tokens": 1821490297.0, + "step": 3800 + }, + { + "epoch": 2.255786350148368, + "grad_norm": 0.5345518589019775, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7585468888282776, + "num_tokens": 1821961698.0, + "step": 3801 + }, + { + "epoch": 2.2563798219584568, + "grad_norm": 0.5487446784973145, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.759306788444519, + "num_tokens": 1822438647.0, + "step": 3802 + }, + { + "epoch": 2.256973293768546, + "grad_norm": 0.5491772890090942, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7655709981918335, + "num_tokens": 1822907825.0, + "step": 3803 + }, + { + "epoch": 2.257566765578635, + "grad_norm": 0.5425847768783569, + "learning_rate": 1e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7569864988327026, + "num_tokens": 1823370300.0, + "step": 3804 + }, + { + "epoch": 2.258160237388724, + "grad_norm": 0.5410498380661011, + "learning_rate": 1e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7795639038085938, + "num_tokens": 1823853293.0, + "step": 3805 + }, + { + "epoch": 2.258753709198813, + "grad_norm": 0.5829510688781738, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7634514570236206, + "num_tokens": 1824295434.0, + "step": 3806 + }, + { + "epoch": 2.259347181008902, + "grad_norm": 0.5523165464401245, + "learning_rate": 1e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7690109014511108, + 
"num_tokens": 1824782801.0, + "step": 3807 + }, + { + "epoch": 2.259940652818991, + "grad_norm": 0.5407438278198242, + "learning_rate": 1e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7655044794082642, + "num_tokens": 1825275326.0, + "step": 3808 + }, + { + "epoch": 2.26053412462908, + "grad_norm": 0.5540130138397217, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7741080522537231, + "num_tokens": 1825779974.0, + "step": 3809 + }, + { + "epoch": 2.2611275964391693, + "grad_norm": 0.5976507067680359, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7677531242370605, + "num_tokens": 1826217652.0, + "step": 3810 + }, + { + "epoch": 2.2617210682492583, + "grad_norm": 0.5791492462158203, + "learning_rate": 1e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7659142017364502, + "num_tokens": 1826692281.0, + "step": 3811 + }, + { + "epoch": 2.2623145400593474, + "grad_norm": 0.5501576662063599, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7702402472496033, + "num_tokens": 1827203081.0, + "step": 3812 + }, + { + "epoch": 2.262908011869436, + "grad_norm": 0.5167375206947327, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7747619152069092, + "num_tokens": 1827719481.0, + "step": 3813 + }, + { + "epoch": 2.263501483679525, + "grad_norm": 0.5548369884490967, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7528050541877747, + "num_tokens": 1828186795.0, + "step": 3814 + }, + { + "epoch": 2.264094955489614, + "grad_norm": 0.5891826152801514, + "learning_rate": 1e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.7776986360549927, + "num_tokens": 1828624277.0, + "step": 3815 + }, + { + "epoch": 2.264688427299703, + "grad_norm": 0.6047608852386475, + "learning_rate": 1e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7759210467338562, + "num_tokens": 1829098436.0, + "step": 3816 + }, + { + "epoch": 2.2652818991097923, + "grad_norm": 0.5486683249473572, + 
"learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7650241851806641, + "num_tokens": 1829594286.0, + "step": 3817 + }, + { + "epoch": 2.2658753709198813, + "grad_norm": 0.5542730689048767, + "learning_rate": 1e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7692961692810059, + "num_tokens": 1830041900.0, + "step": 3818 + }, + { + "epoch": 2.2664688427299704, + "grad_norm": 0.5254340171813965, + "learning_rate": 1e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.7883239984512329, + "num_tokens": 1830567019.0, + "step": 3819 + }, + { + "epoch": 2.2670623145400595, + "grad_norm": 0.5285918116569519, + "learning_rate": 1e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7706246376037598, + "num_tokens": 1831072342.0, + "step": 3820 + }, + { + "epoch": 2.2676557863501485, + "grad_norm": 0.5461658835411072, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7635058164596558, + "num_tokens": 1831564719.0, + "step": 3821 + }, + { + "epoch": 2.268249258160237, + "grad_norm": 0.5497167110443115, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7729088068008423, + "num_tokens": 1832085060.0, + "step": 3822 + }, + { + "epoch": 2.268842729970326, + "grad_norm": 0.5727159380912781, + "learning_rate": 1e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7787613868713379, + "num_tokens": 1832558500.0, + "step": 3823 + }, + { + "epoch": 2.2694362017804153, + "grad_norm": 0.549619197845459, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7661495208740234, + "num_tokens": 1833057864.0, + "step": 3824 + }, + { + "epoch": 2.2700296735905043, + "grad_norm": 0.5252525806427002, + "learning_rate": 1e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7742272615432739, + "num_tokens": 1833543001.0, + "step": 3825 + }, + { + "epoch": 2.2706231454005934, + "grad_norm": 0.5583546757698059, + "learning_rate": 1e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.7819004654884338, + "num_tokens": 1834020098.0, + 
"step": 3826 + }, + { + "epoch": 2.2712166172106825, + "grad_norm": 0.5695216059684753, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7548299431800842, + "num_tokens": 1834482773.0, + "step": 3827 + }, + { + "epoch": 2.2718100890207715, + "grad_norm": 0.5264758467674255, + "learning_rate": 1e-06, + "loss": 0.6809, + "mean_token_accuracy": 0.782014012336731, + "num_tokens": 1834966718.0, + "step": 3828 + }, + { + "epoch": 2.2724035608308606, + "grad_norm": 0.5785689949989319, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7636498212814331, + "num_tokens": 1835442936.0, + "step": 3829 + }, + { + "epoch": 2.2729970326409497, + "grad_norm": 0.5449944138526917, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7698166370391846, + "num_tokens": 1835971557.0, + "step": 3830 + }, + { + "epoch": 2.2735905044510387, + "grad_norm": 0.5635749697685242, + "learning_rate": 1e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7775869369506836, + "num_tokens": 1836422153.0, + "step": 3831 + }, + { + "epoch": 2.274183976261128, + "grad_norm": 0.5359649658203125, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7712037563323975, + "num_tokens": 1836915025.0, + "step": 3832 + }, + { + "epoch": 2.274777448071217, + "grad_norm": 0.546905517578125, + "learning_rate": 1e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7758244276046753, + "num_tokens": 1837403400.0, + "step": 3833 + }, + { + "epoch": 2.2753709198813055, + "grad_norm": 0.5684008598327637, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.766200065612793, + "num_tokens": 1837855026.0, + "step": 3834 + }, + { + "epoch": 2.2759643916913945, + "grad_norm": 0.5331974029541016, + "learning_rate": 1e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7733707427978516, + "num_tokens": 1838356378.0, + "step": 3835 + }, + { + "epoch": 2.2765578635014836, + "grad_norm": 0.5420063734054565, + "learning_rate": 1e-06, + "loss": 
0.7003, + "mean_token_accuracy": 0.7759275436401367, + "num_tokens": 1838851951.0, + "step": 3836 + }, + { + "epoch": 2.2771513353115727, + "grad_norm": 0.5450705289840698, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7723298668861389, + "num_tokens": 1839320123.0, + "step": 3837 + }, + { + "epoch": 2.2777448071216617, + "grad_norm": 0.5555652379989624, + "learning_rate": 1e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7663918137550354, + "num_tokens": 1839820587.0, + "step": 3838 + }, + { + "epoch": 2.278338278931751, + "grad_norm": 0.5581540465354919, + "learning_rate": 1e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7778863906860352, + "num_tokens": 1840332101.0, + "step": 3839 + }, + { + "epoch": 2.27893175074184, + "grad_norm": 0.5569396018981934, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7682157754898071, + "num_tokens": 1840812270.0, + "step": 3840 + }, + { + "epoch": 2.279525222551929, + "grad_norm": 0.5557010173797607, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.766983151435852, + "num_tokens": 1841303162.0, + "step": 3841 + }, + { + "epoch": 2.280118694362018, + "grad_norm": 0.5602604150772095, + "learning_rate": 1e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7763767242431641, + "num_tokens": 1841784259.0, + "step": 3842 + }, + { + "epoch": 2.2807121661721066, + "grad_norm": 0.6001350283622742, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7724514603614807, + "num_tokens": 1842208018.0, + "step": 3843 + }, + { + "epoch": 2.2813056379821957, + "grad_norm": 0.5676200985908508, + "learning_rate": 1e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7714210748672485, + "num_tokens": 1842670578.0, + "step": 3844 + }, + { + "epoch": 2.2818991097922847, + "grad_norm": 0.5876781940460205, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.765927791595459, + "num_tokens": 1843121403.0, + "step": 3845 + }, + { + "epoch": 
2.282492581602374, + "grad_norm": 0.560387134552002, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7674272060394287, + "num_tokens": 1843572722.0, + "step": 3846 + }, + { + "epoch": 2.283086053412463, + "grad_norm": 0.5380437970161438, + "learning_rate": 1e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7714732885360718, + "num_tokens": 1844046155.0, + "step": 3847 + }, + { + "epoch": 2.283679525222552, + "grad_norm": 0.5438279509544373, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7662065029144287, + "num_tokens": 1844547170.0, + "step": 3848 + }, + { + "epoch": 2.284272997032641, + "grad_norm": 0.5685710310935974, + "learning_rate": 1e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7659210562705994, + "num_tokens": 1844998793.0, + "step": 3849 + }, + { + "epoch": 2.28486646884273, + "grad_norm": 0.5103175044059753, + "learning_rate": 1e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7762271165847778, + "num_tokens": 1845531585.0, + "step": 3850 + }, + { + "epoch": 2.285459940652819, + "grad_norm": 0.5399824380874634, + "learning_rate": 1e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7703098654747009, + "num_tokens": 1846019232.0, + "step": 3851 + }, + { + "epoch": 2.286053412462908, + "grad_norm": 0.5058450698852539, + "learning_rate": 1e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.7911604046821594, + "num_tokens": 1846548389.0, + "step": 3852 + }, + { + "epoch": 2.2866468842729972, + "grad_norm": 0.5534268021583557, + "learning_rate": 1e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7810590863227844, + "num_tokens": 1846982435.0, + "step": 3853 + }, + { + "epoch": 2.2872403560830863, + "grad_norm": 0.5446163415908813, + "learning_rate": 1e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7635868191719055, + "num_tokens": 1847479690.0, + "step": 3854 + }, + { + "epoch": 2.287833827893175, + "grad_norm": 0.5888944864273071, + "learning_rate": 1e-06, + "loss": 0.6712, + "mean_token_accuracy": 
0.7838559150695801, + "num_tokens": 1847908432.0, + "step": 3855 + }, + { + "epoch": 2.288427299703264, + "grad_norm": 0.5426392555236816, + "learning_rate": 1e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7640723586082458, + "num_tokens": 1848406850.0, + "step": 3856 + }, + { + "epoch": 2.289020771513353, + "grad_norm": 0.5069498419761658, + "learning_rate": 1e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7811823487281799, + "num_tokens": 1848953636.0, + "step": 3857 + }, + { + "epoch": 2.289614243323442, + "grad_norm": 0.5630350708961487, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7577329277992249, + "num_tokens": 1849408551.0, + "step": 3858 + }, + { + "epoch": 2.290207715133531, + "grad_norm": 0.5271462202072144, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7734816074371338, + "num_tokens": 1849880077.0, + "step": 3859 + }, + { + "epoch": 2.2908011869436202, + "grad_norm": 0.6605808138847351, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7719155550003052, + "num_tokens": 1850365196.0, + "step": 3860 + }, + { + "epoch": 2.2913946587537093, + "grad_norm": 0.5656191110610962, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7699502110481262, + "num_tokens": 1850840076.0, + "step": 3861 + }, + { + "epoch": 2.2919881305637984, + "grad_norm": 0.527463972568512, + "learning_rate": 1e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7766338586807251, + "num_tokens": 1851341074.0, + "step": 3862 + }, + { + "epoch": 2.2925816023738874, + "grad_norm": 0.5498812198638916, + "learning_rate": 1e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.780123770236969, + "num_tokens": 1851802904.0, + "step": 3863 + }, + { + "epoch": 2.293175074183976, + "grad_norm": 0.5544865727424622, + "learning_rate": 1e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7736285924911499, + "num_tokens": 1852286841.0, + "step": 3864 + }, + { + "epoch": 2.293768545994065, + "grad_norm": 
0.5436771512031555, + "learning_rate": 1e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7854135036468506, + "num_tokens": 1852764253.0, + "step": 3865 + }, + { + "epoch": 2.294362017804154, + "grad_norm": 0.5445511341094971, + "learning_rate": 1e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.7795390486717224, + "num_tokens": 1853247607.0, + "step": 3866 + }, + { + "epoch": 2.2949554896142432, + "grad_norm": 0.5583537817001343, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7745893001556396, + "num_tokens": 1853719344.0, + "step": 3867 + }, + { + "epoch": 2.2955489614243323, + "grad_norm": 0.5684219002723694, + "learning_rate": 1e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7760677933692932, + "num_tokens": 1854181848.0, + "step": 3868 + }, + { + "epoch": 2.2961424332344214, + "grad_norm": 0.5670471787452698, + "learning_rate": 1e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7844316363334656, + "num_tokens": 1854651526.0, + "step": 3869 + }, + { + "epoch": 2.2967359050445104, + "grad_norm": 0.5438379645347595, + "learning_rate": 1e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7760176062583923, + "num_tokens": 1855148316.0, + "step": 3870 + }, + { + "epoch": 2.2973293768545995, + "grad_norm": 0.5863308310508728, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.772917628288269, + "num_tokens": 1855619657.0, + "step": 3871 + }, + { + "epoch": 2.2979228486646885, + "grad_norm": 0.5703233480453491, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7660872936248779, + "num_tokens": 1856063675.0, + "step": 3872 + }, + { + "epoch": 2.2985163204747776, + "grad_norm": 0.5684253573417664, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7594366073608398, + "num_tokens": 1856537473.0, + "step": 3873 + }, + { + "epoch": 2.2991097922848667, + "grad_norm": 0.5491806268692017, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7713364362716675, + 
"num_tokens": 1857039847.0, + "step": 3874 + }, + { + "epoch": 2.2997032640949557, + "grad_norm": 0.5315622091293335, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7699308395385742, + "num_tokens": 1857546524.0, + "step": 3875 + }, + { + "epoch": 2.3002967359050444, + "grad_norm": 0.539280354976654, + "learning_rate": 1e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7766627073287964, + "num_tokens": 1858049657.0, + "step": 3876 + }, + { + "epoch": 2.3008902077151334, + "grad_norm": 0.5470167398452759, + "learning_rate": 1e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7804633378982544, + "num_tokens": 1858504948.0, + "step": 3877 + }, + { + "epoch": 2.3014836795252225, + "grad_norm": 0.5977206826210022, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7582501769065857, + "num_tokens": 1858952148.0, + "step": 3878 + }, + { + "epoch": 2.3020771513353115, + "grad_norm": 0.6068907976150513, + "learning_rate": 1e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7597353458404541, + "num_tokens": 1859408649.0, + "step": 3879 + }, + { + "epoch": 2.3026706231454006, + "grad_norm": 0.5616665482521057, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7648653984069824, + "num_tokens": 1859893380.0, + "step": 3880 + }, + { + "epoch": 2.3032640949554897, + "grad_norm": 0.5932428240776062, + "learning_rate": 1e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7610710263252258, + "num_tokens": 1860362221.0, + "step": 3881 + }, + { + "epoch": 2.3038575667655787, + "grad_norm": 0.5723780989646912, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7732488512992859, + "num_tokens": 1860849852.0, + "step": 3882 + }, + { + "epoch": 2.304451038575668, + "grad_norm": 0.5440641641616821, + "learning_rate": 1e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.772594153881073, + "num_tokens": 1861323954.0, + "step": 3883 + }, + { + "epoch": 2.305044510385757, + "grad_norm": 0.5556223392486572, + 
"learning_rate": 1e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.7814335823059082, + "num_tokens": 1861812415.0, + "step": 3884 + }, + { + "epoch": 2.3056379821958455, + "grad_norm": 0.5418269038200378, + "learning_rate": 1e-06, + "loss": 0.6923, + "mean_token_accuracy": 0.7808616161346436, + "num_tokens": 1862323558.0, + "step": 3885 + }, + { + "epoch": 2.3062314540059345, + "grad_norm": 0.5304699540138245, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7734448909759521, + "num_tokens": 1862830064.0, + "step": 3886 + }, + { + "epoch": 2.3068249258160236, + "grad_norm": 0.5779682993888855, + "learning_rate": 1e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7647273540496826, + "num_tokens": 1863288257.0, + "step": 3887 + }, + { + "epoch": 2.3074183976261127, + "grad_norm": 0.5565757155418396, + "learning_rate": 1e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7612560987472534, + "num_tokens": 1863767768.0, + "step": 3888 + }, + { + "epoch": 2.3080118694362017, + "grad_norm": 0.5323523879051208, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7711119651794434, + "num_tokens": 1864245835.0, + "step": 3889 + }, + { + "epoch": 2.308605341246291, + "grad_norm": 0.5575748085975647, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7713491916656494, + "num_tokens": 1864709083.0, + "step": 3890 + }, + { + "epoch": 2.30919881305638, + "grad_norm": 0.5479755997657776, + "learning_rate": 1e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7658025622367859, + "num_tokens": 1865200084.0, + "step": 3891 + }, + { + "epoch": 2.309792284866469, + "grad_norm": 0.5567826628684998, + "learning_rate": 1e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7830703258514404, + "num_tokens": 1865667884.0, + "step": 3892 + }, + { + "epoch": 2.310385756676558, + "grad_norm": 0.5770797729492188, + "learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.767480194568634, + "num_tokens": 1866139348.0, + 
"step": 3893 + }, + { + "epoch": 2.310979228486647, + "grad_norm": 0.5779449939727783, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7738741636276245, + "num_tokens": 1866642245.0, + "step": 3894 + }, + { + "epoch": 2.311572700296736, + "grad_norm": 0.5420065522193909, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7638553977012634, + "num_tokens": 1867129482.0, + "step": 3895 + }, + { + "epoch": 2.3121661721068247, + "grad_norm": 0.5615834593772888, + "learning_rate": 1e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7720931768417358, + "num_tokens": 1867633725.0, + "step": 3896 + }, + { + "epoch": 2.312759643916914, + "grad_norm": 0.5489416122436523, + "learning_rate": 1e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7751318216323853, + "num_tokens": 1868099798.0, + "step": 3897 + }, + { + "epoch": 2.313353115727003, + "grad_norm": 0.5443056225776672, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.765906035900116, + "num_tokens": 1868580126.0, + "step": 3898 + }, + { + "epoch": 2.313946587537092, + "grad_norm": 0.5424150824546814, + "learning_rate": 1e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7865456342697144, + "num_tokens": 1869056472.0, + "step": 3899 + }, + { + "epoch": 2.314540059347181, + "grad_norm": 0.5328260064125061, + "learning_rate": 1e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7747364044189453, + "num_tokens": 1869534463.0, + "step": 3900 + }, + { + "epoch": 2.31513353115727, + "grad_norm": 0.5286770462989807, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7658286690711975, + "num_tokens": 1870052020.0, + "step": 3901 + }, + { + "epoch": 2.315727002967359, + "grad_norm": 0.544376790523529, + "learning_rate": 1e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.77403324842453, + "num_tokens": 1870550915.0, + "step": 3902 + }, + { + "epoch": 2.316320474777448, + "grad_norm": 0.6078777313232422, + "learning_rate": 1e-06, + "loss": 0.7644, + 
"mean_token_accuracy": 0.7563265562057495, + "num_tokens": 1870979077.0, + "step": 3903 + }, + { + "epoch": 2.3169139465875372, + "grad_norm": 0.529268205165863, + "learning_rate": 1e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7780829071998596, + "num_tokens": 1871487205.0, + "step": 3904 + }, + { + "epoch": 2.3175074183976263, + "grad_norm": 0.537574291229248, + "learning_rate": 1e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7757536172866821, + "num_tokens": 1871994090.0, + "step": 3905 + }, + { + "epoch": 2.318100890207715, + "grad_norm": 0.5256884098052979, + "learning_rate": 1e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7744557857513428, + "num_tokens": 1872494544.0, + "step": 3906 + }, + { + "epoch": 2.318694362017804, + "grad_norm": 0.5552887320518494, + "learning_rate": 1e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7795737981796265, + "num_tokens": 1872980229.0, + "step": 3907 + }, + { + "epoch": 2.319287833827893, + "grad_norm": 0.5602490901947021, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7617630362510681, + "num_tokens": 1873468761.0, + "step": 3908 + }, + { + "epoch": 2.319881305637982, + "grad_norm": 0.5808425545692444, + "learning_rate": 1e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.7807630896568298, + "num_tokens": 1873919881.0, + "step": 3909 + }, + { + "epoch": 2.320474777448071, + "grad_norm": 0.566398024559021, + "learning_rate": 1e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7655858993530273, + "num_tokens": 1874406363.0, + "step": 3910 + }, + { + "epoch": 2.3210682492581602, + "grad_norm": 0.5168448090553284, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7639920711517334, + "num_tokens": 1874933128.0, + "step": 3911 + }, + { + "epoch": 2.3216617210682493, + "grad_norm": 0.5417001843452454, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7786027193069458, + "num_tokens": 1875405841.0, + "step": 3912 + }, + { + "epoch": 2.3222551928783384, 
+ "grad_norm": 0.5270395874977112, + "learning_rate": 1e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7662703990936279, + "num_tokens": 1875907757.0, + "step": 3913 + }, + { + "epoch": 2.3228486646884274, + "grad_norm": 0.5556724667549133, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7656404972076416, + "num_tokens": 1876387000.0, + "step": 3914 + }, + { + "epoch": 2.3234421364985165, + "grad_norm": 0.5564756393432617, + "learning_rate": 1e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7603195905685425, + "num_tokens": 1876855345.0, + "step": 3915 + }, + { + "epoch": 2.3240356083086056, + "grad_norm": 0.5546430349349976, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7662466764450073, + "num_tokens": 1877328607.0, + "step": 3916 + }, + { + "epoch": 2.324629080118694, + "grad_norm": 0.5559896230697632, + "learning_rate": 1e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.758722722530365, + "num_tokens": 1877802101.0, + "step": 3917 + }, + { + "epoch": 2.3252225519287832, + "grad_norm": 0.5709952712059021, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.773507833480835, + "num_tokens": 1878266397.0, + "step": 3918 + }, + { + "epoch": 2.3258160237388723, + "grad_norm": 0.5398157835006714, + "learning_rate": 1e-06, + "loss": 0.6767, + "mean_token_accuracy": 0.7827488780021667, + "num_tokens": 1878741785.0, + "step": 3919 + }, + { + "epoch": 2.3264094955489614, + "grad_norm": 0.5754505395889282, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7600077390670776, + "num_tokens": 1879215968.0, + "step": 3920 + }, + { + "epoch": 2.3270029673590504, + "grad_norm": 0.5918354392051697, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7688244581222534, + "num_tokens": 1879677622.0, + "step": 3921 + }, + { + "epoch": 2.3275964391691395, + "grad_norm": 0.5584729909896851, + "learning_rate": 1e-06, + "loss": 0.7898, + "mean_token_accuracy": 
0.7533242702484131, + "num_tokens": 1880151756.0, + "step": 3922 + }, + { + "epoch": 2.3281899109792286, + "grad_norm": 0.5290592312812805, + "learning_rate": 1e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.776721715927124, + "num_tokens": 1880674758.0, + "step": 3923 + }, + { + "epoch": 2.3287833827893176, + "grad_norm": 0.5081532001495361, + "learning_rate": 1e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7743130922317505, + "num_tokens": 1881207827.0, + "step": 3924 + }, + { + "epoch": 2.3293768545994067, + "grad_norm": 0.5635585784912109, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7608179450035095, + "num_tokens": 1881682373.0, + "step": 3925 + }, + { + "epoch": 2.3299703264094953, + "grad_norm": 0.5425196886062622, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7639825940132141, + "num_tokens": 1882185832.0, + "step": 3926 + }, + { + "epoch": 2.3305637982195844, + "grad_norm": 0.5462527275085449, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7606626749038696, + "num_tokens": 1882709677.0, + "step": 3927 + }, + { + "epoch": 2.3311572700296734, + "grad_norm": 0.5558508634567261, + "learning_rate": 1e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7643160223960876, + "num_tokens": 1883156861.0, + "step": 3928 + }, + { + "epoch": 2.3317507418397625, + "grad_norm": 0.5878053307533264, + "learning_rate": 1e-06, + "loss": 0.78, + "mean_token_accuracy": 0.754618763923645, + "num_tokens": 1883616912.0, + "step": 3929 + }, + { + "epoch": 2.3323442136498516, + "grad_norm": 0.5703604221343994, + "learning_rate": 1e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7774841785430908, + "num_tokens": 1884043931.0, + "step": 3930 + }, + { + "epoch": 2.3329376854599406, + "grad_norm": 0.551536500453949, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7543888688087463, + "num_tokens": 1884507508.0, + "step": 3931 + }, + { + "epoch": 2.3335311572700297, + "grad_norm": 
0.5743581056594849, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7746384143829346, + "num_tokens": 1884943668.0, + "step": 3932 + }, + { + "epoch": 2.3341246290801188, + "grad_norm": 0.5464922785758972, + "learning_rate": 1e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7816156148910522, + "num_tokens": 1885404507.0, + "step": 3933 + }, + { + "epoch": 2.334718100890208, + "grad_norm": 0.5408255457878113, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7644189596176147, + "num_tokens": 1885887554.0, + "step": 3934 + }, + { + "epoch": 2.335311572700297, + "grad_norm": 0.5439229607582092, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7671264410018921, + "num_tokens": 1886337280.0, + "step": 3935 + }, + { + "epoch": 2.335905044510386, + "grad_norm": 0.5618768334388733, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7654309272766113, + "num_tokens": 1886771549.0, + "step": 3936 + }, + { + "epoch": 2.336498516320475, + "grad_norm": 0.5234859585762024, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7764906883239746, + "num_tokens": 1887252898.0, + "step": 3937 + }, + { + "epoch": 2.3370919881305636, + "grad_norm": 0.5327508449554443, + "learning_rate": 1e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7717112302780151, + "num_tokens": 1887731300.0, + "step": 3938 + }, + { + "epoch": 2.3376854599406527, + "grad_norm": 0.5383269786834717, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7695274353027344, + "num_tokens": 1888192842.0, + "step": 3939 + }, + { + "epoch": 2.3382789317507418, + "grad_norm": 0.5273813009262085, + "learning_rate": 1e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.784074068069458, + "num_tokens": 1888675072.0, + "step": 3940 + }, + { + "epoch": 2.338872403560831, + "grad_norm": 0.556577742099762, + "learning_rate": 1e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7618991136550903, + "num_tokens": 
1889188393.0, + "step": 3941 + }, + { + "epoch": 2.33946587537092, + "grad_norm": 0.5469996333122253, + "learning_rate": 1e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7800730466842651, + "num_tokens": 1889650211.0, + "step": 3942 + }, + { + "epoch": 2.340059347181009, + "grad_norm": 0.5478249192237854, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7738127112388611, + "num_tokens": 1890107934.0, + "step": 3943 + }, + { + "epoch": 2.340652818991098, + "grad_norm": 0.541589617729187, + "learning_rate": 1e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7770911455154419, + "num_tokens": 1890629283.0, + "step": 3944 + }, + { + "epoch": 2.341246290801187, + "grad_norm": 0.5704174041748047, + "learning_rate": 1e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7770118713378906, + "num_tokens": 1891117647.0, + "step": 3945 + }, + { + "epoch": 2.341839762611276, + "grad_norm": 0.5551863312721252, + "learning_rate": 1e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7793200612068176, + "num_tokens": 1891575174.0, + "step": 3946 + }, + { + "epoch": 2.3424332344213648, + "grad_norm": 0.5594547390937805, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7698455452919006, + "num_tokens": 1892055515.0, + "step": 3947 + }, + { + "epoch": 2.343026706231454, + "grad_norm": 0.5552281737327576, + "learning_rate": 1e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7769756317138672, + "num_tokens": 1892512073.0, + "step": 3948 + }, + { + "epoch": 2.343620178041543, + "grad_norm": 0.518054187297821, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7586731910705566, + "num_tokens": 1893015470.0, + "step": 3949 + }, + { + "epoch": 2.344213649851632, + "grad_norm": 0.5936706066131592, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7658281326293945, + "num_tokens": 1893464474.0, + "step": 3950 + }, + { + "epoch": 2.344807121661721, + "grad_norm": 0.528569757938385, + "learning_rate": 1e-06, + 
"loss": 0.6731, + "mean_token_accuracy": 0.7823896408081055, + "num_tokens": 1893963834.0, + "step": 3951 + }, + { + "epoch": 2.34540059347181, + "grad_norm": 0.5313788652420044, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7653048038482666, + "num_tokens": 1894457453.0, + "step": 3952 + }, + { + "epoch": 2.345994065281899, + "grad_norm": 0.5199040174484253, + "learning_rate": 1e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7694780826568604, + "num_tokens": 1894956966.0, + "step": 3953 + }, + { + "epoch": 2.346587537091988, + "grad_norm": 0.5455551147460938, + "learning_rate": 1e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7789698839187622, + "num_tokens": 1895445970.0, + "step": 3954 + }, + { + "epoch": 2.3471810089020773, + "grad_norm": 0.528637707233429, + "learning_rate": 1e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7808632850646973, + "num_tokens": 1895937001.0, + "step": 3955 + }, + { + "epoch": 2.3477744807121663, + "grad_norm": 0.5359262228012085, + "learning_rate": 1e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7811858057975769, + "num_tokens": 1896427130.0, + "step": 3956 + }, + { + "epoch": 2.3483679525222554, + "grad_norm": 0.5623412728309631, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7671871185302734, + "num_tokens": 1896894476.0, + "step": 3957 + }, + { + "epoch": 2.3489614243323444, + "grad_norm": 0.536217212677002, + "learning_rate": 1e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7614717483520508, + "num_tokens": 1897380837.0, + "step": 3958 + }, + { + "epoch": 2.349554896142433, + "grad_norm": 0.5496490001678467, + "learning_rate": 1e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7815159559249878, + "num_tokens": 1897849202.0, + "step": 3959 + }, + { + "epoch": 2.350148367952522, + "grad_norm": 0.5458109378814697, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7790895700454712, + "num_tokens": 1898317832.0, + "step": 3960 + }, + { + "epoch": 
2.350741839762611, + "grad_norm": 0.5541232824325562, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.768617570400238, + "num_tokens": 1898783019.0, + "step": 3961 + }, + { + "epoch": 2.3513353115727003, + "grad_norm": 0.5463823080062866, + "learning_rate": 1e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.789887547492981, + "num_tokens": 1899293919.0, + "step": 3962 + }, + { + "epoch": 2.3519287833827893, + "grad_norm": 0.5772910714149475, + "learning_rate": 1e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.774634599685669, + "num_tokens": 1899724502.0, + "step": 3963 + }, + { + "epoch": 2.3525222551928784, + "grad_norm": 0.5329845547676086, + "learning_rate": 1e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7715450525283813, + "num_tokens": 1900230288.0, + "step": 3964 + }, + { + "epoch": 2.3531157270029674, + "grad_norm": 0.5768800377845764, + "learning_rate": 1e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7838658690452576, + "num_tokens": 1900678165.0, + "step": 3965 + }, + { + "epoch": 2.3537091988130565, + "grad_norm": 0.5218232274055481, + "learning_rate": 1e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7799966335296631, + "num_tokens": 1901170226.0, + "step": 3966 + }, + { + "epoch": 2.3543026706231456, + "grad_norm": 0.5407451391220093, + "learning_rate": 1e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7694965600967407, + "num_tokens": 1901674197.0, + "step": 3967 + }, + { + "epoch": 2.354896142433234, + "grad_norm": 0.5435091853141785, + "learning_rate": 1e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7766265869140625, + "num_tokens": 1902140742.0, + "step": 3968 + }, + { + "epoch": 2.3554896142433233, + "grad_norm": 0.5547896027565002, + "learning_rate": 1e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7633176445960999, + "num_tokens": 1902587129.0, + "step": 3969 + }, + { + "epoch": 2.3560830860534123, + "grad_norm": 0.5482670068740845, + "learning_rate": 1e-06, + "loss": 0.6747, + "mean_token_accuracy": 
0.7816358208656311, + "num_tokens": 1903083243.0, + "step": 3970 + }, + { + "epoch": 2.3566765578635014, + "grad_norm": 0.5629475712776184, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7657999992370605, + "num_tokens": 1903549227.0, + "step": 3971 + }, + { + "epoch": 2.3572700296735905, + "grad_norm": 0.5330225825309753, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7583472728729248, + "num_tokens": 1904096581.0, + "step": 3972 + }, + { + "epoch": 2.3578635014836795, + "grad_norm": 0.5444706082344055, + "learning_rate": 1e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7610111832618713, + "num_tokens": 1904568863.0, + "step": 3973 + }, + { + "epoch": 2.3584569732937686, + "grad_norm": 0.5509290099143982, + "learning_rate": 1e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7786627411842346, + "num_tokens": 1905090803.0, + "step": 3974 + }, + { + "epoch": 2.3590504451038576, + "grad_norm": 0.5717368125915527, + "learning_rate": 1e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7792608141899109, + "num_tokens": 1905534491.0, + "step": 3975 + }, + { + "epoch": 2.3596439169139467, + "grad_norm": 0.6150004863739014, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7668445706367493, + "num_tokens": 1905965019.0, + "step": 3976 + }, + { + "epoch": 2.3602373887240358, + "grad_norm": 0.5816900134086609, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7699037790298462, + "num_tokens": 1906434590.0, + "step": 3977 + }, + { + "epoch": 2.360830860534125, + "grad_norm": 0.5488670468330383, + "learning_rate": 1e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7596681714057922, + "num_tokens": 1906931281.0, + "step": 3978 + }, + { + "epoch": 2.361424332344214, + "grad_norm": 0.5655660033226013, + "learning_rate": 1e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7691506147384644, + "num_tokens": 1907383278.0, + "step": 3979 + }, + { + "epoch": 2.3620178041543025, + "grad_norm": 
0.5475654602050781, + "learning_rate": 1e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7785007953643799, + "num_tokens": 1907873747.0, + "step": 3980 + }, + { + "epoch": 2.3626112759643916, + "grad_norm": 0.5414785742759705, + "learning_rate": 1e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.785524308681488, + "num_tokens": 1908365305.0, + "step": 3981 + }, + { + "epoch": 2.3632047477744806, + "grad_norm": 0.5610886216163635, + "learning_rate": 1e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7666867971420288, + "num_tokens": 1908823858.0, + "step": 3982 + }, + { + "epoch": 2.3637982195845697, + "grad_norm": 0.5429356694221497, + "learning_rate": 1e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7805207967758179, + "num_tokens": 1909294033.0, + "step": 3983 + }, + { + "epoch": 2.3643916913946588, + "grad_norm": 0.5218015313148499, + "learning_rate": 1e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7725939750671387, + "num_tokens": 1909784427.0, + "step": 3984 + }, + { + "epoch": 2.364985163204748, + "grad_norm": 0.5715532898902893, + "learning_rate": 1e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7752363681793213, + "num_tokens": 1910244986.0, + "step": 3985 + }, + { + "epoch": 2.365578635014837, + "grad_norm": 0.5771734714508057, + "learning_rate": 1e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7741920948028564, + "num_tokens": 1910721367.0, + "step": 3986 + }, + { + "epoch": 2.366172106824926, + "grad_norm": 0.5310767889022827, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7686780095100403, + "num_tokens": 1911211163.0, + "step": 3987 + }, + { + "epoch": 2.366765578635015, + "grad_norm": 0.570146918296814, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.764279842376709, + "num_tokens": 1911681960.0, + "step": 3988 + }, + { + "epoch": 2.3673590504451036, + "grad_norm": 0.5982980728149414, + "learning_rate": 1e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7762236595153809, + "num_tokens": 
1912167570.0, + "step": 3989 + }, + { + "epoch": 2.3679525222551927, + "grad_norm": 0.5327185392379761, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7689938545227051, + "num_tokens": 1912651967.0, + "step": 3990 + }, + { + "epoch": 2.3685459940652818, + "grad_norm": 0.5593534111976624, + "learning_rate": 1e-06, + "loss": 0.693, + "mean_token_accuracy": 0.775530219078064, + "num_tokens": 1913148506.0, + "step": 3991 + }, + { + "epoch": 2.369139465875371, + "grad_norm": 0.5801378488540649, + "learning_rate": 1e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7770954966545105, + "num_tokens": 1913661553.0, + "step": 3992 + }, + { + "epoch": 2.36973293768546, + "grad_norm": 0.5571126937866211, + "learning_rate": 1e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.7686227560043335, + "num_tokens": 1914127837.0, + "step": 3993 + }, + { + "epoch": 2.370326409495549, + "grad_norm": 0.5283412337303162, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7668695449829102, + "num_tokens": 1914605327.0, + "step": 3994 + }, + { + "epoch": 2.370919881305638, + "grad_norm": 0.5872597694396973, + "learning_rate": 1e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7697800397872925, + "num_tokens": 1915062174.0, + "step": 3995 + }, + { + "epoch": 2.371513353115727, + "grad_norm": 0.5501189827919006, + "learning_rate": 1e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7791210412979126, + "num_tokens": 1915598457.0, + "step": 3996 + }, + { + "epoch": 2.372106824925816, + "grad_norm": 0.5338958501815796, + "learning_rate": 1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7701742649078369, + "num_tokens": 1916080324.0, + "step": 3997 + }, + { + "epoch": 2.372700296735905, + "grad_norm": 0.5753322839736938, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7672117948532104, + "num_tokens": 1916567736.0, + "step": 3998 + }, + { + "epoch": 2.3732937685459943, + "grad_norm": 0.5776742100715637, + "learning_rate": 1e-06, 
+ "loss": 0.7559, + "mean_token_accuracy": 0.762693464756012, + "num_tokens": 1917063518.0, + "step": 3999 + }, + { + "epoch": 2.373887240356083, + "grad_norm": 0.5684077143669128, + "learning_rate": 1e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7740705013275146, + "num_tokens": 1917545168.0, + "step": 4000 + }, + { + "epoch": 2.374480712166172, + "grad_norm": 0.5332441329956055, + "learning_rate": 1e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7740324139595032, + "num_tokens": 1918038370.0, + "step": 4001 + }, + { + "epoch": 2.375074183976261, + "grad_norm": 0.5665338039398193, + "learning_rate": 1e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7744184136390686, + "num_tokens": 1918549368.0, + "step": 4002 + }, + { + "epoch": 2.37566765578635, + "grad_norm": 0.5502590537071228, + "learning_rate": 1e-06, + "loss": 0.6483, + "mean_token_accuracy": 0.7903914451599121, + "num_tokens": 1919011608.0, + "step": 4003 + }, + { + "epoch": 2.376261127596439, + "grad_norm": 0.601531982421875, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7595045566558838, + "num_tokens": 1919438983.0, + "step": 4004 + }, + { + "epoch": 2.376854599406528, + "grad_norm": 0.5686721801757812, + "learning_rate": 1e-06, + "loss": 0.6698, + "mean_token_accuracy": 0.7849119901657104, + "num_tokens": 1919889558.0, + "step": 4005 + }, + { + "epoch": 2.3774480712166173, + "grad_norm": 0.5499917268753052, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7724687457084656, + "num_tokens": 1920442569.0, + "step": 4006 + }, + { + "epoch": 2.3780415430267063, + "grad_norm": 0.5717713832855225, + "learning_rate": 1e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.776020348072052, + "num_tokens": 1920905877.0, + "step": 4007 + }, + { + "epoch": 2.3786350148367954, + "grad_norm": 0.5676025748252869, + "learning_rate": 1e-06, + "loss": 0.6809, + "mean_token_accuracy": 0.7811645269393921, + "num_tokens": 1921356798.0, + "step": 4008 + }, + { + "epoch": 
2.3792284866468845, + "grad_norm": 0.5777949094772339, + "learning_rate": 1e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7794188261032104, + "num_tokens": 1921825748.0, + "step": 4009 + }, + { + "epoch": 2.379821958456973, + "grad_norm": 0.5448148250579834, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7651576995849609, + "num_tokens": 1922350534.0, + "step": 4010 + }, + { + "epoch": 2.380415430267062, + "grad_norm": 0.5541651844978333, + "learning_rate": 1e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7670571804046631, + "num_tokens": 1922837273.0, + "step": 4011 + }, + { + "epoch": 2.381008902077151, + "grad_norm": 0.5269199013710022, + "learning_rate": 1e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7798852920532227, + "num_tokens": 1923377096.0, + "step": 4012 + }, + { + "epoch": 2.3816023738872403, + "grad_norm": 0.5524513125419617, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7682085037231445, + "num_tokens": 1923847736.0, + "step": 4013 + }, + { + "epoch": 2.3821958456973293, + "grad_norm": 0.5339100360870361, + "learning_rate": 1e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7797863483428955, + "num_tokens": 1924358773.0, + "step": 4014 + }, + { + "epoch": 2.3827893175074184, + "grad_norm": 0.5471411347389221, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7690204381942749, + "num_tokens": 1924834972.0, + "step": 4015 + }, + { + "epoch": 2.3833827893175075, + "grad_norm": 0.5746520757675171, + "learning_rate": 1e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7601677179336548, + "num_tokens": 1925322005.0, + "step": 4016 + }, + { + "epoch": 2.3839762611275965, + "grad_norm": 0.5559874773025513, + "learning_rate": 1e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7772905826568604, + "num_tokens": 1925792539.0, + "step": 4017 + }, + { + "epoch": 2.3845697329376856, + "grad_norm": 0.5393399596214294, + "learning_rate": 1e-06, + "loss": 0.6907, + "mean_token_accuracy": 
0.7774943113327026, + "num_tokens": 1926267791.0, + "step": 4018 + }, + { + "epoch": 2.3851632047477747, + "grad_norm": 0.5740556120872498, + "learning_rate": 1e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7763888239860535, + "num_tokens": 1926718734.0, + "step": 4019 + }, + { + "epoch": 2.3857566765578637, + "grad_norm": 0.5564551949501038, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7671258449554443, + "num_tokens": 1927212805.0, + "step": 4020 + }, + { + "epoch": 2.3863501483679523, + "grad_norm": 0.5894469618797302, + "learning_rate": 1e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7735878229141235, + "num_tokens": 1927668008.0, + "step": 4021 + }, + { + "epoch": 2.3869436201780414, + "grad_norm": 0.591250479221344, + "learning_rate": 1e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7708930969238281, + "num_tokens": 1928108415.0, + "step": 4022 + }, + { + "epoch": 2.3875370919881305, + "grad_norm": 0.5730209350585938, + "learning_rate": 1e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7745044231414795, + "num_tokens": 1928614588.0, + "step": 4023 + }, + { + "epoch": 2.3881305637982195, + "grad_norm": 0.5733233690261841, + "learning_rate": 1e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7803690433502197, + "num_tokens": 1929107172.0, + "step": 4024 + }, + { + "epoch": 2.3887240356083086, + "grad_norm": 0.5401624441146851, + "learning_rate": 1e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.76568603515625, + "num_tokens": 1929589668.0, + "step": 4025 + }, + { + "epoch": 2.3893175074183977, + "grad_norm": 0.5527912378311157, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.77109694480896, + "num_tokens": 1930023159.0, + "step": 4026 + }, + { + "epoch": 2.3899109792284867, + "grad_norm": 0.5742594599723816, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7776573300361633, + "num_tokens": 1930515154.0, + "step": 4027 + }, + { + "epoch": 2.390504451038576, + "grad_norm": 
0.5824849605560303, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7686828970909119, + "num_tokens": 1930992379.0, + "step": 4028 + }, + { + "epoch": 2.391097922848665, + "grad_norm": 0.5677292943000793, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.773825466632843, + "num_tokens": 1931442054.0, + "step": 4029 + }, + { + "epoch": 2.3916913946587535, + "grad_norm": 0.5720227956771851, + "learning_rate": 1e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7712039947509766, + "num_tokens": 1931935253.0, + "step": 4030 + }, + { + "epoch": 2.3922848664688425, + "grad_norm": 0.5739685893058777, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7679393291473389, + "num_tokens": 1932352774.0, + "step": 4031 + }, + { + "epoch": 2.3928783382789316, + "grad_norm": 0.5624223351478577, + "learning_rate": 1e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7800090312957764, + "num_tokens": 1932816267.0, + "step": 4032 + }, + { + "epoch": 2.3934718100890207, + "grad_norm": 0.5696190595626831, + "learning_rate": 1e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7607403993606567, + "num_tokens": 1933293244.0, + "step": 4033 + }, + { + "epoch": 2.3940652818991097, + "grad_norm": 0.5774266719818115, + "learning_rate": 1e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.7932803630828857, + "num_tokens": 1933758121.0, + "step": 4034 + }, + { + "epoch": 2.394658753709199, + "grad_norm": 0.5749768614768982, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7623136043548584, + "num_tokens": 1934229927.0, + "step": 4035 + }, + { + "epoch": 2.395252225519288, + "grad_norm": 0.5480430126190186, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7703924775123596, + "num_tokens": 1934706734.0, + "step": 4036 + }, + { + "epoch": 2.395845697329377, + "grad_norm": 0.5969294309616089, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7724603414535522, + 
"num_tokens": 1935156390.0, + "step": 4037 + }, + { + "epoch": 2.396439169139466, + "grad_norm": 0.5492481589317322, + "learning_rate": 1e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7767761945724487, + "num_tokens": 1935638770.0, + "step": 4038 + }, + { + "epoch": 2.397032640949555, + "grad_norm": 0.5498949885368347, + "learning_rate": 1e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7780259847640991, + "num_tokens": 1936119832.0, + "step": 4039 + }, + { + "epoch": 2.397626112759644, + "grad_norm": 0.5425861477851868, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.766819953918457, + "num_tokens": 1936607314.0, + "step": 4040 + }, + { + "epoch": 2.398219584569733, + "grad_norm": 0.5588927268981934, + "learning_rate": 1e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7633935213088989, + "num_tokens": 1937084295.0, + "step": 4041 + }, + { + "epoch": 2.398813056379822, + "grad_norm": 0.5501899719238281, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7717204093933105, + "num_tokens": 1937584132.0, + "step": 4042 + }, + { + "epoch": 2.399406528189911, + "grad_norm": 0.5466634035110474, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7699705362319946, + "num_tokens": 1938077564.0, + "step": 4043 + }, + { + "epoch": 2.4, + "grad_norm": 0.5884194374084473, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7698412537574768, + "num_tokens": 1938504629.0, + "step": 4044 + }, + { + "epoch": 2.400593471810089, + "grad_norm": 0.5569417476654053, + "learning_rate": 1e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.786338210105896, + "num_tokens": 1939018375.0, + "step": 4045 + }, + { + "epoch": 2.401186943620178, + "grad_norm": 0.5870042443275452, + "learning_rate": 1e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7780413627624512, + "num_tokens": 1939452295.0, + "step": 4046 + }, + { + "epoch": 2.401780415430267, + "grad_norm": 0.5501320958137512, + "learning_rate": 1e-06, 
+ "loss": 0.7274, + "mean_token_accuracy": 0.7699395418167114, + "num_tokens": 1939946270.0, + "step": 4047 + }, + { + "epoch": 2.402373887240356, + "grad_norm": 0.5542975664138794, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7727419137954712, + "num_tokens": 1940453287.0, + "step": 4048 + }, + { + "epoch": 2.4029673590504452, + "grad_norm": 0.5062658190727234, + "learning_rate": 1e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7840060591697693, + "num_tokens": 1940949607.0, + "step": 4049 + }, + { + "epoch": 2.4035608308605343, + "grad_norm": 0.5586448311805725, + "learning_rate": 1e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7850003242492676, + "num_tokens": 1941382015.0, + "step": 4050 + }, + { + "epoch": 2.404154302670623, + "grad_norm": 0.613767683506012, + "learning_rate": 1e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.7866201400756836, + "num_tokens": 1941847518.0, + "step": 4051 + }, + { + "epoch": 2.404747774480712, + "grad_norm": 0.5577391982078552, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7759488821029663, + "num_tokens": 1942331280.0, + "step": 4052 + }, + { + "epoch": 2.405341246290801, + "grad_norm": 0.5696035027503967, + "learning_rate": 1e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7645002603530884, + "num_tokens": 1942799970.0, + "step": 4053 + }, + { + "epoch": 2.40593471810089, + "grad_norm": 0.5424214005470276, + "learning_rate": 1e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7853707075119019, + "num_tokens": 1943310402.0, + "step": 4054 + }, + { + "epoch": 2.406528189910979, + "grad_norm": 0.556891143321991, + "learning_rate": 1e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7758685350418091, + "num_tokens": 1943770025.0, + "step": 4055 + }, + { + "epoch": 2.4071216617210682, + "grad_norm": 0.5408994555473328, + "learning_rate": 1e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.766588568687439, + "num_tokens": 1944262236.0, + "step": 4056 + }, + { + "epoch": 
2.4077151335311573, + "grad_norm": 0.5708072781562805, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7661044597625732, + "num_tokens": 1944707287.0, + "step": 4057 + }, + { + "epoch": 2.4083086053412464, + "grad_norm": 0.5661405324935913, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7653571367263794, + "num_tokens": 1945185192.0, + "step": 4058 + }, + { + "epoch": 2.4089020771513354, + "grad_norm": 0.5715323686599731, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7758784890174866, + "num_tokens": 1945685317.0, + "step": 4059 + }, + { + "epoch": 2.4094955489614245, + "grad_norm": 0.5973775386810303, + "learning_rate": 1e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7627360820770264, + "num_tokens": 1946173140.0, + "step": 4060 + }, + { + "epoch": 2.4100890207715135, + "grad_norm": 0.5421015024185181, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7675138115882874, + "num_tokens": 1946630865.0, + "step": 4061 + }, + { + "epoch": 2.4106824925816026, + "grad_norm": 0.5363019704818726, + "learning_rate": 1e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7758080959320068, + "num_tokens": 1947101840.0, + "step": 4062 + }, + { + "epoch": 2.4112759643916912, + "grad_norm": 0.5911299586296082, + "learning_rate": 1e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7733092904090881, + "num_tokens": 1947556566.0, + "step": 4063 + }, + { + "epoch": 2.4118694362017803, + "grad_norm": 0.5621746778488159, + "learning_rate": 1e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.7750130891799927, + "num_tokens": 1948041400.0, + "step": 4064 + }, + { + "epoch": 2.4124629080118694, + "grad_norm": 0.5450804829597473, + "learning_rate": 1e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7834550142288208, + "num_tokens": 1948504369.0, + "step": 4065 + }, + { + "epoch": 2.4130563798219584, + "grad_norm": 0.5557975769042969, + "learning_rate": 1e-06, + "loss": 0.6816, + 
"mean_token_accuracy": 0.7795486450195312, + "num_tokens": 1948978725.0, + "step": 4066 + }, + { + "epoch": 2.4136498516320475, + "grad_norm": 0.5774551033973694, + "learning_rate": 1e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.7812241315841675, + "num_tokens": 1949409734.0, + "step": 4067 + }, + { + "epoch": 2.4142433234421365, + "grad_norm": 0.5472142696380615, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7695488929748535, + "num_tokens": 1949892358.0, + "step": 4068 + }, + { + "epoch": 2.4148367952522256, + "grad_norm": 0.562029242515564, + "learning_rate": 1e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7578261494636536, + "num_tokens": 1950363862.0, + "step": 4069 + }, + { + "epoch": 2.4154302670623147, + "grad_norm": 0.5840917229652405, + "learning_rate": 1e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7716116309165955, + "num_tokens": 1950815786.0, + "step": 4070 + }, + { + "epoch": 2.4160237388724037, + "grad_norm": 0.5421916246414185, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7700906991958618, + "num_tokens": 1951283455.0, + "step": 4071 + }, + { + "epoch": 2.4166172106824924, + "grad_norm": 0.5467656850814819, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7710453271865845, + "num_tokens": 1951757595.0, + "step": 4072 + }, + { + "epoch": 2.4172106824925814, + "grad_norm": 0.601810872554779, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.770483672618866, + "num_tokens": 1952222983.0, + "step": 4073 + }, + { + "epoch": 2.4178041543026705, + "grad_norm": 0.5755776762962341, + "learning_rate": 1e-06, + "loss": 0.6542, + "mean_token_accuracy": 0.7886855602264404, + "num_tokens": 1952679919.0, + "step": 4074 + }, + { + "epoch": 2.4183976261127595, + "grad_norm": 0.5547249913215637, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7701570987701416, + "num_tokens": 1953128372.0, + "step": 4075 + }, + { + "epoch": 
2.4189910979228486, + "grad_norm": 0.5827676653862, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7659159898757935, + "num_tokens": 1953584450.0, + "step": 4076 + }, + { + "epoch": 2.4195845697329377, + "grad_norm": 0.5780925750732422, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7704225182533264, + "num_tokens": 1954103636.0, + "step": 4077 + }, + { + "epoch": 2.4201780415430267, + "grad_norm": 0.554783284664154, + "learning_rate": 1e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7800227403640747, + "num_tokens": 1954560491.0, + "step": 4078 + }, + { + "epoch": 2.420771513353116, + "grad_norm": 0.5747748613357544, + "learning_rate": 1e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.7844441533088684, + "num_tokens": 1955027612.0, + "step": 4079 + }, + { + "epoch": 2.421364985163205, + "grad_norm": 0.5765325427055359, + "learning_rate": 1e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7620021104812622, + "num_tokens": 1955464525.0, + "step": 4080 + }, + { + "epoch": 2.421958456973294, + "grad_norm": 0.5749925374984741, + "learning_rate": 1e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7726967334747314, + "num_tokens": 1955928897.0, + "step": 4081 + }, + { + "epoch": 2.422551928783383, + "grad_norm": 0.573282778263092, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7627062797546387, + "num_tokens": 1956378801.0, + "step": 4082 + }, + { + "epoch": 2.423145400593472, + "grad_norm": 0.5720486640930176, + "learning_rate": 1e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.774420976638794, + "num_tokens": 1956841860.0, + "step": 4083 + }, + { + "epoch": 2.4237388724035607, + "grad_norm": 0.5433860421180725, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7631203532218933, + "num_tokens": 1957305062.0, + "step": 4084 + }, + { + "epoch": 2.4243323442136497, + "grad_norm": 0.5326481461524963, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 
0.7700223326683044, + "num_tokens": 1957771846.0, + "step": 4085 + }, + { + "epoch": 2.424925816023739, + "grad_norm": 0.5424807071685791, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.765975832939148, + "num_tokens": 1958271860.0, + "step": 4086 + }, + { + "epoch": 2.425519287833828, + "grad_norm": 0.5798866152763367, + "learning_rate": 1e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7652876377105713, + "num_tokens": 1958752005.0, + "step": 4087 + }, + { + "epoch": 2.426112759643917, + "grad_norm": 0.5809770822525024, + "learning_rate": 1e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7609305381774902, + "num_tokens": 1959209475.0, + "step": 4088 + }, + { + "epoch": 2.426706231454006, + "grad_norm": 0.5726100206375122, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7563595771789551, + "num_tokens": 1959702060.0, + "step": 4089 + }, + { + "epoch": 2.427299703264095, + "grad_norm": 0.607688307762146, + "learning_rate": 1e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.766123354434967, + "num_tokens": 1960159184.0, + "step": 4090 + }, + { + "epoch": 2.427893175074184, + "grad_norm": 0.547916054725647, + "learning_rate": 1e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.780815064907074, + "num_tokens": 1960634306.0, + "step": 4091 + }, + { + "epoch": 2.428486646884273, + "grad_norm": 0.5531718134880066, + "learning_rate": 1e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7697163224220276, + "num_tokens": 1961124265.0, + "step": 4092 + }, + { + "epoch": 2.429080118694362, + "grad_norm": 0.5795264840126038, + "learning_rate": 1e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.781214714050293, + "num_tokens": 1961606884.0, + "step": 4093 + }, + { + "epoch": 2.429673590504451, + "grad_norm": 0.5652590990066528, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7700773477554321, + "num_tokens": 1962111830.0, + "step": 4094 + }, + { + "epoch": 2.43026706231454, + "grad_norm": 
0.5639579892158508, + "learning_rate": 1e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7695596814155579, + "num_tokens": 1962600406.0, + "step": 4095 + }, + { + "epoch": 2.430860534124629, + "grad_norm": 0.552727222442627, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7590367794036865, + "num_tokens": 1963110149.0, + "step": 4096 + }, + { + "epoch": 2.431454005934718, + "grad_norm": 0.586633026599884, + "learning_rate": 1e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7675172686576843, + "num_tokens": 1963577423.0, + "step": 4097 + }, + { + "epoch": 2.432047477744807, + "grad_norm": 0.5312545299530029, + "learning_rate": 1e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7782641649246216, + "num_tokens": 1964073214.0, + "step": 4098 + }, + { + "epoch": 2.432640949554896, + "grad_norm": 0.5193890333175659, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7624539136886597, + "num_tokens": 1964601458.0, + "step": 4099 + }, + { + "epoch": 2.4332344213649852, + "grad_norm": 0.5600568056106567, + "learning_rate": 1e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7704953551292419, + "num_tokens": 1965132813.0, + "step": 4100 + }, + { + "epoch": 2.4338278931750743, + "grad_norm": 0.573651909828186, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7684050798416138, + "num_tokens": 1965608044.0, + "step": 4101 + }, + { + "epoch": 2.4344213649851634, + "grad_norm": 0.5249114036560059, + "learning_rate": 1e-06, + "loss": 0.6608, + "mean_token_accuracy": 0.7869603633880615, + "num_tokens": 1966080925.0, + "step": 4102 + }, + { + "epoch": 2.4350148367952524, + "grad_norm": 0.5338755249977112, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7734152674674988, + "num_tokens": 1966576618.0, + "step": 4103 + }, + { + "epoch": 2.435608308605341, + "grad_norm": 0.5478979349136353, + "learning_rate": 1e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.7880788445472717, + "num_tokens": 
1967029714.0, + "step": 4104 + }, + { + "epoch": 2.43620178041543, + "grad_norm": 0.5165589451789856, + "learning_rate": 1e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7704932689666748, + "num_tokens": 1967564116.0, + "step": 4105 + }, + { + "epoch": 2.436795252225519, + "grad_norm": 0.5487171411514282, + "learning_rate": 1e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7846568822860718, + "num_tokens": 1968010048.0, + "step": 4106 + }, + { + "epoch": 2.4373887240356082, + "grad_norm": 0.5401192903518677, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7620213031768799, + "num_tokens": 1968508338.0, + "step": 4107 + }, + { + "epoch": 2.4379821958456973, + "grad_norm": 0.5456669330596924, + "learning_rate": 1e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7590921521186829, + "num_tokens": 1968988429.0, + "step": 4108 + }, + { + "epoch": 2.4385756676557864, + "grad_norm": 0.5557281374931335, + "learning_rate": 1e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7728614807128906, + "num_tokens": 1969453785.0, + "step": 4109 + }, + { + "epoch": 2.4391691394658754, + "grad_norm": 0.5559225082397461, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7735576629638672, + "num_tokens": 1969919502.0, + "step": 4110 + }, + { + "epoch": 2.4397626112759645, + "grad_norm": 0.5800442695617676, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7647867202758789, + "num_tokens": 1970367595.0, + "step": 4111 + }, + { + "epoch": 2.4403560830860536, + "grad_norm": 0.5272473692893982, + "learning_rate": 1e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7540454864501953, + "num_tokens": 1970874390.0, + "step": 4112 + }, + { + "epoch": 2.4409495548961426, + "grad_norm": 0.5577599406242371, + "learning_rate": 1e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7702188491821289, + "num_tokens": 1971331187.0, + "step": 4113 + }, + { + "epoch": 2.4415430267062312, + "grad_norm": 0.560893177986145, + "learning_rate": 
1e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.7848126888275146, + "num_tokens": 1971795828.0, + "step": 4114 + }, + { + "epoch": 2.4421364985163203, + "grad_norm": 0.5662849545478821, + "learning_rate": 1e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7801162004470825, + "num_tokens": 1972253677.0, + "step": 4115 + }, + { + "epoch": 2.4427299703264094, + "grad_norm": 0.5674921870231628, + "learning_rate": 1e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7580692768096924, + "num_tokens": 1972732308.0, + "step": 4116 + }, + { + "epoch": 2.4433234421364984, + "grad_norm": 0.5435652732849121, + "learning_rate": 1e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7726103067398071, + "num_tokens": 1973214838.0, + "step": 4117 + }, + { + "epoch": 2.4439169139465875, + "grad_norm": 0.5421527028083801, + "learning_rate": 1e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7672739028930664, + "num_tokens": 1973703527.0, + "step": 4118 + }, + { + "epoch": 2.4445103857566766, + "grad_norm": 0.599586546421051, + "learning_rate": 1e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7673457264900208, + "num_tokens": 1974127858.0, + "step": 4119 + }, + { + "epoch": 2.4451038575667656, + "grad_norm": 0.5249542593955994, + "learning_rate": 1e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7816287875175476, + "num_tokens": 1974624340.0, + "step": 4120 + }, + { + "epoch": 2.4456973293768547, + "grad_norm": 0.5232545137405396, + "learning_rate": 1e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7784467935562134, + "num_tokens": 1975150373.0, + "step": 4121 + }, + { + "epoch": 2.4462908011869438, + "grad_norm": 0.5490128397941589, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7579424381256104, + "num_tokens": 1975614230.0, + "step": 4122 + }, + { + "epoch": 2.446884272997033, + "grad_norm": 0.5303844809532166, + "learning_rate": 1e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7857566475868225, + "num_tokens": 1976091158.0, + "step": 4123 + }, 
+ { + "epoch": 2.447477744807122, + "grad_norm": 0.5732218623161316, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.764484167098999, + "num_tokens": 1976551170.0, + "step": 4124 + }, + { + "epoch": 2.4480712166172105, + "grad_norm": 0.5334784388542175, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7604075074195862, + "num_tokens": 1977024262.0, + "step": 4125 + }, + { + "epoch": 2.4486646884272996, + "grad_norm": 0.5479231476783752, + "learning_rate": 1e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7747380137443542, + "num_tokens": 1977520492.0, + "step": 4126 + }, + { + "epoch": 2.4492581602373886, + "grad_norm": 0.5485187768936157, + "learning_rate": 1e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7781572937965393, + "num_tokens": 1978003654.0, + "step": 4127 + }, + { + "epoch": 2.4498516320474777, + "grad_norm": 0.5782300233840942, + "learning_rate": 1e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7710486650466919, + "num_tokens": 1978418160.0, + "step": 4128 + }, + { + "epoch": 2.4504451038575668, + "grad_norm": 0.5419113039970398, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7688227891921997, + "num_tokens": 1978887326.0, + "step": 4129 + }, + { + "epoch": 2.451038575667656, + "grad_norm": 0.5474616885185242, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.755403995513916, + "num_tokens": 1979373350.0, + "step": 4130 + }, + { + "epoch": 2.451632047477745, + "grad_norm": 0.5439400672912598, + "learning_rate": 1e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7767373323440552, + "num_tokens": 1979871010.0, + "step": 4131 + }, + { + "epoch": 2.452225519287834, + "grad_norm": 0.5406590700149536, + "learning_rate": 1e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7795279026031494, + "num_tokens": 1980380200.0, + "step": 4132 + }, + { + "epoch": 2.452818991097923, + "grad_norm": 0.5409836173057556, + "learning_rate": 1e-06, + "loss": 0.7281, + 
"mean_token_accuracy": 0.7702890038490295, + "num_tokens": 1980895365.0, + "step": 4133 + }, + { + "epoch": 2.4534124629080116, + "grad_norm": 0.5731067657470703, + "learning_rate": 1e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7761125564575195, + "num_tokens": 1981346216.0, + "step": 4134 + }, + { + "epoch": 2.4540059347181007, + "grad_norm": 0.5728688836097717, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7629432678222656, + "num_tokens": 1981814973.0, + "step": 4135 + }, + { + "epoch": 2.4545994065281898, + "grad_norm": 0.5351424813270569, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7641043663024902, + "num_tokens": 1982348119.0, + "step": 4136 + }, + { + "epoch": 2.455192878338279, + "grad_norm": 0.573769748210907, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7548701763153076, + "num_tokens": 1982821609.0, + "step": 4137 + }, + { + "epoch": 2.455786350148368, + "grad_norm": 0.5547954440116882, + "learning_rate": 1e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7774819731712341, + "num_tokens": 1983285284.0, + "step": 4138 + }, + { + "epoch": 2.456379821958457, + "grad_norm": 0.5603556632995605, + "learning_rate": 1e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.7831161618232727, + "num_tokens": 1983740454.0, + "step": 4139 + }, + { + "epoch": 2.456973293768546, + "grad_norm": 0.5576350092887878, + "learning_rate": 1e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7796205282211304, + "num_tokens": 1984209847.0, + "step": 4140 + }, + { + "epoch": 2.457566765578635, + "grad_norm": 0.5769627094268799, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7608567476272583, + "num_tokens": 1984655215.0, + "step": 4141 + }, + { + "epoch": 2.458160237388724, + "grad_norm": 0.5128679275512695, + "learning_rate": 1e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7831470966339111, + "num_tokens": 1985174568.0, + "step": 4142 + }, + { + "epoch": 2.458753709198813, 
+ "grad_norm": 0.5515980124473572, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7668840289115906, + "num_tokens": 1985643678.0, + "step": 4143 + }, + { + "epoch": 2.4593471810089023, + "grad_norm": 0.5208150148391724, + "learning_rate": 1e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7847336530685425, + "num_tokens": 1986137301.0, + "step": 4144 + }, + { + "epoch": 2.4599406528189913, + "grad_norm": 0.5571316480636597, + "learning_rate": 1e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7637666463851929, + "num_tokens": 1986650954.0, + "step": 4145 + }, + { + "epoch": 2.46053412462908, + "grad_norm": 0.5688709020614624, + "learning_rate": 1e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7702897787094116, + "num_tokens": 1987098431.0, + "step": 4146 + }, + { + "epoch": 2.461127596439169, + "grad_norm": 0.5970539450645447, + "learning_rate": 1e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7773655652999878, + "num_tokens": 1987560313.0, + "step": 4147 + }, + { + "epoch": 2.461721068249258, + "grad_norm": 0.5334010720252991, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7735573053359985, + "num_tokens": 1988056608.0, + "step": 4148 + }, + { + "epoch": 2.462314540059347, + "grad_norm": 0.5286883115768433, + "learning_rate": 1e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.78230881690979, + "num_tokens": 1988573100.0, + "step": 4149 + }, + { + "epoch": 2.462908011869436, + "grad_norm": 0.5395922660827637, + "learning_rate": 1e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7707173824310303, + "num_tokens": 1989103580.0, + "step": 4150 + }, + { + "epoch": 2.4635014836795253, + "grad_norm": 0.5569303035736084, + "learning_rate": 1e-06, + "loss": 0.678, + "mean_token_accuracy": 0.7819317579269409, + "num_tokens": 1989589002.0, + "step": 4151 + }, + { + "epoch": 2.4640949554896143, + "grad_norm": 0.5580945014953613, + "learning_rate": 1e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.775477409362793, + 
"num_tokens": 1990036707.0, + "step": 4152 + }, + { + "epoch": 2.4646884272997034, + "grad_norm": 0.5565522313117981, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7734628915786743, + "num_tokens": 1990478528.0, + "step": 4153 + }, + { + "epoch": 2.4652818991097925, + "grad_norm": 0.5686991214752197, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7681543827056885, + "num_tokens": 1990947194.0, + "step": 4154 + }, + { + "epoch": 2.465875370919881, + "grad_norm": 0.5687745213508606, + "learning_rate": 1e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7807050943374634, + "num_tokens": 1991372924.0, + "step": 4155 + }, + { + "epoch": 2.46646884272997, + "grad_norm": 0.5340123176574707, + "learning_rate": 1e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7736976742744446, + "num_tokens": 1991862695.0, + "step": 4156 + }, + { + "epoch": 2.467062314540059, + "grad_norm": 0.5781863927841187, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7670531272888184, + "num_tokens": 1992352819.0, + "step": 4157 + }, + { + "epoch": 2.4676557863501483, + "grad_norm": 0.5534612536430359, + "learning_rate": 1e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7690058350563049, + "num_tokens": 1992843257.0, + "step": 4158 + }, + { + "epoch": 2.4682492581602373, + "grad_norm": 0.5196546912193298, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7607842683792114, + "num_tokens": 1993331692.0, + "step": 4159 + }, + { + "epoch": 2.4688427299703264, + "grad_norm": 0.5379504561424255, + "learning_rate": 1e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7780094742774963, + "num_tokens": 1993803338.0, + "step": 4160 + }, + { + "epoch": 2.4694362017804155, + "grad_norm": 0.5677748918533325, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7746113538742065, + "num_tokens": 1994275467.0, + "step": 4161 + }, + { + "epoch": 2.4700296735905045, + "grad_norm": 0.5963862538337708, + 
"learning_rate": 1e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.778276264667511, + "num_tokens": 1994691313.0, + "step": 4162 + }, + { + "epoch": 2.4706231454005936, + "grad_norm": 0.5230413675308228, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7802920341491699, + "num_tokens": 1995168780.0, + "step": 4163 + }, + { + "epoch": 2.4712166172106826, + "grad_norm": 0.5442497134208679, + "learning_rate": 1e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7742982506752014, + "num_tokens": 1995624567.0, + "step": 4164 + }, + { + "epoch": 2.4718100890207717, + "grad_norm": 0.5504491925239563, + "learning_rate": 1e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.7876421213150024, + "num_tokens": 1996124337.0, + "step": 4165 + }, + { + "epoch": 2.4724035608308608, + "grad_norm": 0.6050947904586792, + "learning_rate": 1e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.77210533618927, + "num_tokens": 1996564782.0, + "step": 4166 + }, + { + "epoch": 2.4729970326409494, + "grad_norm": 0.5450499653816223, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7616167068481445, + "num_tokens": 1997080220.0, + "step": 4167 + }, + { + "epoch": 2.4735905044510385, + "grad_norm": 0.5978739261627197, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7758287191390991, + "num_tokens": 1997533847.0, + "step": 4168 + }, + { + "epoch": 2.4741839762611275, + "grad_norm": 0.5615165829658508, + "learning_rate": 1e-06, + "loss": 0.6761, + "mean_token_accuracy": 0.784111499786377, + "num_tokens": 1998048217.0, + "step": 4169 + }, + { + "epoch": 2.4747774480712166, + "grad_norm": 0.52744060754776, + "learning_rate": 1e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.7881430387496948, + "num_tokens": 1998557406.0, + "step": 4170 + }, + { + "epoch": 2.4753709198813056, + "grad_norm": 0.5622207522392273, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7578321695327759, + "num_tokens": 1998986376.0, + 
"step": 4171 + }, + { + "epoch": 2.4759643916913947, + "grad_norm": 0.587618350982666, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7625371813774109, + "num_tokens": 1999421993.0, + "step": 4172 + }, + { + "epoch": 2.4765578635014838, + "grad_norm": 0.5860242247581482, + "learning_rate": 1e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7746689915657043, + "num_tokens": 1999899491.0, + "step": 4173 + }, + { + "epoch": 2.477151335311573, + "grad_norm": 0.5643093585968018, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7574833631515503, + "num_tokens": 2000354888.0, + "step": 4174 + }, + { + "epoch": 2.477744807121662, + "grad_norm": 0.5238267183303833, + "learning_rate": 1e-06, + "loss": 0.6823, + "mean_token_accuracy": 0.7813748121261597, + "num_tokens": 2000858173.0, + "step": 4175 + }, + { + "epoch": 2.4783382789317505, + "grad_norm": 0.5625728964805603, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7497003674507141, + "num_tokens": 2001340962.0, + "step": 4176 + }, + { + "epoch": 2.4789317507418396, + "grad_norm": 0.5953350067138672, + "learning_rate": 1e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.781630277633667, + "num_tokens": 2001812341.0, + "step": 4177 + }, + { + "epoch": 2.4795252225519286, + "grad_norm": 0.5584080815315247, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.756119430065155, + "num_tokens": 2002266716.0, + "step": 4178 + }, + { + "epoch": 2.4801186943620177, + "grad_norm": 0.5519623160362244, + "learning_rate": 1e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7646304368972778, + "num_tokens": 2002746640.0, + "step": 4179 + }, + { + "epoch": 2.4807121661721068, + "grad_norm": 0.5546639561653137, + "learning_rate": 1e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7661546468734741, + "num_tokens": 2003224772.0, + "step": 4180 + }, + { + "epoch": 2.481305637982196, + "grad_norm": 0.5381820201873779, + "learning_rate": 1e-06, + "loss": 
0.714, + "mean_token_accuracy": 0.7722578644752502, + "num_tokens": 2003699622.0, + "step": 4181 + }, + { + "epoch": 2.481899109792285, + "grad_norm": 0.5481154322624207, + "learning_rate": 1e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7608296871185303, + "num_tokens": 2004194573.0, + "step": 4182 + }, + { + "epoch": 2.482492581602374, + "grad_norm": 0.532283365726471, + "learning_rate": 1e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7599940299987793, + "num_tokens": 2004682465.0, + "step": 4183 + }, + { + "epoch": 2.483086053412463, + "grad_norm": 0.5611511468887329, + "learning_rate": 1e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7739758491516113, + "num_tokens": 2005136968.0, + "step": 4184 + }, + { + "epoch": 2.483679525222552, + "grad_norm": 0.5634406208992004, + "learning_rate": 1e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7650247812271118, + "num_tokens": 2005634908.0, + "step": 4185 + }, + { + "epoch": 2.484272997032641, + "grad_norm": 0.5749930739402771, + "learning_rate": 1e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7669577598571777, + "num_tokens": 2006110335.0, + "step": 4186 + }, + { + "epoch": 2.48486646884273, + "grad_norm": 0.554132342338562, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7704758048057556, + "num_tokens": 2006586455.0, + "step": 4187 + }, + { + "epoch": 2.485459940652819, + "grad_norm": 0.5783421397209167, + "learning_rate": 1e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7728346586227417, + "num_tokens": 2006993980.0, + "step": 4188 + }, + { + "epoch": 2.486053412462908, + "grad_norm": 0.5504971146583557, + "learning_rate": 1e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7816401124000549, + "num_tokens": 2007503104.0, + "step": 4189 + }, + { + "epoch": 2.486646884272997, + "grad_norm": 0.542829692363739, + "learning_rate": 1e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7797069549560547, + "num_tokens": 2007984480.0, + "step": 4190 + }, + { + "epoch": 
2.487240356083086, + "grad_norm": 0.58444744348526, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7673267126083374, + "num_tokens": 2008442331.0, + "step": 4191 + }, + { + "epoch": 2.487833827893175, + "grad_norm": 0.5639715790748596, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7729182243347168, + "num_tokens": 2008943465.0, + "step": 4192 + }, + { + "epoch": 2.488427299703264, + "grad_norm": 0.5628265142440796, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7691687345504761, + "num_tokens": 2009400639.0, + "step": 4193 + }, + { + "epoch": 2.489020771513353, + "grad_norm": 0.5608643293380737, + "learning_rate": 1e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7635531425476074, + "num_tokens": 2009859481.0, + "step": 4194 + }, + { + "epoch": 2.4896142433234423, + "grad_norm": 0.5739902853965759, + "learning_rate": 1e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.773398756980896, + "num_tokens": 2010306673.0, + "step": 4195 + }, + { + "epoch": 2.4902077151335313, + "grad_norm": 0.577923059463501, + "learning_rate": 1e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7727123498916626, + "num_tokens": 2010742398.0, + "step": 4196 + }, + { + "epoch": 2.49080118694362, + "grad_norm": 0.5506657361984253, + "learning_rate": 1e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7668822407722473, + "num_tokens": 2011210360.0, + "step": 4197 + }, + { + "epoch": 2.491394658753709, + "grad_norm": 0.5371988415718079, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7705531120300293, + "num_tokens": 2011716104.0, + "step": 4198 + }, + { + "epoch": 2.491988130563798, + "grad_norm": 0.5558494925498962, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7685506343841553, + "num_tokens": 2012221793.0, + "step": 4199 + }, + { + "epoch": 2.492581602373887, + "grad_norm": 0.53468918800354, + "learning_rate": 1e-06, + "loss": 0.7025, + "mean_token_accuracy": 
0.7737194895744324, + "num_tokens": 2012775522.0, + "step": 4200 + }, + { + "epoch": 2.493175074183976, + "grad_norm": 0.5415019989013672, + "learning_rate": 1e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7670275568962097, + "num_tokens": 2013251637.0, + "step": 4201 + }, + { + "epoch": 2.4937685459940653, + "grad_norm": 0.5520800948143005, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7608491778373718, + "num_tokens": 2013731238.0, + "step": 4202 + }, + { + "epoch": 2.4943620178041543, + "grad_norm": 0.5419222712516785, + "learning_rate": 1e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7601563930511475, + "num_tokens": 2014265902.0, + "step": 4203 + }, + { + "epoch": 2.4949554896142434, + "grad_norm": 0.5412127375602722, + "learning_rate": 1e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7764028310775757, + "num_tokens": 2014758471.0, + "step": 4204 + }, + { + "epoch": 2.4955489614243325, + "grad_norm": 0.542952299118042, + "learning_rate": 1e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7780414819717407, + "num_tokens": 2015217839.0, + "step": 4205 + }, + { + "epoch": 2.4961424332344215, + "grad_norm": 0.542019784450531, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7785173654556274, + "num_tokens": 2015723230.0, + "step": 4206 + }, + { + "epoch": 2.4967359050445106, + "grad_norm": 0.5603153109550476, + "learning_rate": 1e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7709858417510986, + "num_tokens": 2016193969.0, + "step": 4207 + }, + { + "epoch": 2.497329376854599, + "grad_norm": 0.5544591546058655, + "learning_rate": 1e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7746371030807495, + "num_tokens": 2016672027.0, + "step": 4208 + }, + { + "epoch": 2.4979228486646883, + "grad_norm": 0.5245561003684998, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.767513632774353, + "num_tokens": 2017177535.0, + "step": 4209 + }, + { + "epoch": 2.4985163204747773, + "grad_norm": 
0.5493699908256531, + "learning_rate": 1e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7765582799911499, + "num_tokens": 2017692260.0, + "step": 4210 + }, + { + "epoch": 2.4991097922848664, + "grad_norm": 0.5153082609176636, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7759761214256287, + "num_tokens": 2018220474.0, + "step": 4211 + }, + { + "epoch": 2.4997032640949555, + "grad_norm": 0.559798538684845, + "learning_rate": 1e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7561721801757812, + "num_tokens": 2018664065.0, + "step": 4212 + }, + { + "epoch": 2.5002967359050445, + "grad_norm": 0.5564494729042053, + "learning_rate": 1e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7565016746520996, + "num_tokens": 2019141878.0, + "step": 4213 + }, + { + "epoch": 2.5008902077151336, + "grad_norm": 0.5634084343910217, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7631946206092834, + "num_tokens": 2019590281.0, + "step": 4214 + }, + { + "epoch": 2.5014836795252227, + "grad_norm": 0.5501988530158997, + "learning_rate": 1e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7791361212730408, + "num_tokens": 2020087731.0, + "step": 4215 + }, + { + "epoch": 2.5020771513353117, + "grad_norm": 0.552406907081604, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7509201765060425, + "num_tokens": 2020570383.0, + "step": 4216 + }, + { + "epoch": 2.5026706231454003, + "grad_norm": 0.5643271803855896, + "learning_rate": 1e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.770788311958313, + "num_tokens": 2021023722.0, + "step": 4217 + }, + { + "epoch": 2.5032640949554894, + "grad_norm": 0.5315454006195068, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7632404565811157, + "num_tokens": 2021557398.0, + "step": 4218 + }, + { + "epoch": 2.5038575667655785, + "grad_norm": 0.5244895815849304, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7710616588592529, + "num_tokens": 
2022038614.0, + "step": 4219 + }, + { + "epoch": 2.5044510385756675, + "grad_norm": 0.5566268563270569, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7514801025390625, + "num_tokens": 2022520191.0, + "step": 4220 + }, + { + "epoch": 2.5050445103857566, + "grad_norm": 0.5301620364189148, + "learning_rate": 1e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7796012163162231, + "num_tokens": 2023012789.0, + "step": 4221 + }, + { + "epoch": 2.5056379821958457, + "grad_norm": 0.5205740332603455, + "learning_rate": 1e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7662057876586914, + "num_tokens": 2023540403.0, + "step": 4222 + }, + { + "epoch": 2.5062314540059347, + "grad_norm": 0.574079155921936, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7634248733520508, + "num_tokens": 2024001297.0, + "step": 4223 + }, + { + "epoch": 2.506824925816024, + "grad_norm": 0.5658441185951233, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7641452550888062, + "num_tokens": 2024473806.0, + "step": 4224 + }, + { + "epoch": 2.507418397626113, + "grad_norm": 0.5706830024719238, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7684122920036316, + "num_tokens": 2024901753.0, + "step": 4225 + }, + { + "epoch": 2.508011869436202, + "grad_norm": 0.6071184873580933, + "learning_rate": 1e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.774254322052002, + "num_tokens": 2025359148.0, + "step": 4226 + }, + { + "epoch": 2.508605341246291, + "grad_norm": 0.554657518863678, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7659263610839844, + "num_tokens": 2025802834.0, + "step": 4227 + }, + { + "epoch": 2.50919881305638, + "grad_norm": 0.5749792456626892, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7702358365058899, + "num_tokens": 2026270978.0, + "step": 4228 + }, + { + "epoch": 2.509792284866469, + "grad_norm": 0.5129081010818481, + "learning_rate": 
1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7674124240875244, + "num_tokens": 2026810812.0, + "step": 4229 + }, + { + "epoch": 2.5103857566765577, + "grad_norm": 0.5794824361801147, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7602280974388123, + "num_tokens": 2027266858.0, + "step": 4230 + }, + { + "epoch": 2.510979228486647, + "grad_norm": 0.5702782869338989, + "learning_rate": 1e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7706320881843567, + "num_tokens": 2027717791.0, + "step": 4231 + }, + { + "epoch": 2.511572700296736, + "grad_norm": 0.5993922352790833, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7720564603805542, + "num_tokens": 2028171453.0, + "step": 4232 + }, + { + "epoch": 2.512166172106825, + "grad_norm": 0.5361097455024719, + "learning_rate": 1e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7565966844558716, + "num_tokens": 2028654484.0, + "step": 4233 + }, + { + "epoch": 2.512759643916914, + "grad_norm": 0.5352274179458618, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7712813019752502, + "num_tokens": 2029153572.0, + "step": 4234 + }, + { + "epoch": 2.513353115727003, + "grad_norm": 0.5600447058677673, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7701349258422852, + "num_tokens": 2029588032.0, + "step": 4235 + }, + { + "epoch": 2.513946587537092, + "grad_norm": 0.5498859882354736, + "learning_rate": 1e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7701889872550964, + "num_tokens": 2030077272.0, + "step": 4236 + }, + { + "epoch": 2.514540059347181, + "grad_norm": 0.5134148001670837, + "learning_rate": 1e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7721990346908569, + "num_tokens": 2030573460.0, + "step": 4237 + }, + { + "epoch": 2.51513353115727, + "grad_norm": 0.546116054058075, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7725880146026611, + "num_tokens": 2031059977.0, + "step": 4238 + }, + { + 
"epoch": 2.515727002967359, + "grad_norm": 0.5461300611495972, + "learning_rate": 1e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.7809433937072754, + "num_tokens": 2031555506.0, + "step": 4239 + }, + { + "epoch": 2.516320474777448, + "grad_norm": 0.5765200853347778, + "learning_rate": 1e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7613980174064636, + "num_tokens": 2031997302.0, + "step": 4240 + }, + { + "epoch": 2.516913946587537, + "grad_norm": 0.5427538156509399, + "learning_rate": 1e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7812373042106628, + "num_tokens": 2032483679.0, + "step": 4241 + }, + { + "epoch": 2.517507418397626, + "grad_norm": 0.5298835039138794, + "learning_rate": 1e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7745842337608337, + "num_tokens": 2032995881.0, + "step": 4242 + }, + { + "epoch": 2.518100890207715, + "grad_norm": 0.5562661290168762, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7704436182975769, + "num_tokens": 2033467025.0, + "step": 4243 + }, + { + "epoch": 2.518694362017804, + "grad_norm": 0.5310956239700317, + "learning_rate": 1e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7773885130882263, + "num_tokens": 2033948635.0, + "step": 4244 + }, + { + "epoch": 2.5192878338278932, + "grad_norm": 0.5445681810379028, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7642002105712891, + "num_tokens": 2034415957.0, + "step": 4245 + }, + { + "epoch": 2.5198813056379823, + "grad_norm": 0.5639756917953491, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7516268491744995, + "num_tokens": 2034884968.0, + "step": 4246 + }, + { + "epoch": 2.5204747774480714, + "grad_norm": 0.5823887586593628, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.757468044757843, + "num_tokens": 2035375616.0, + "step": 4247 + }, + { + "epoch": 2.5210682492581604, + "grad_norm": 0.5313648581504822, + "learning_rate": 1e-06, + "loss": 0.7859, + 
"mean_token_accuracy": 0.7537072896957397, + "num_tokens": 2035862793.0, + "step": 4248 + }, + { + "epoch": 2.5216617210682495, + "grad_norm": 0.6051337718963623, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7636032104492188, + "num_tokens": 2036287185.0, + "step": 4249 + }, + { + "epoch": 2.5222551928783385, + "grad_norm": 0.5882561206817627, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.768606424331665, + "num_tokens": 2036773540.0, + "step": 4250 + }, + { + "epoch": 2.522848664688427, + "grad_norm": 0.5649267435073853, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7661339044570923, + "num_tokens": 2037223846.0, + "step": 4251 + }, + { + "epoch": 2.5234421364985162, + "grad_norm": 0.5592986345291138, + "learning_rate": 1e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7765514850616455, + "num_tokens": 2037684688.0, + "step": 4252 + }, + { + "epoch": 2.5240356083086053, + "grad_norm": 0.556125283241272, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7671873569488525, + "num_tokens": 2038206213.0, + "step": 4253 + }, + { + "epoch": 2.5246290801186944, + "grad_norm": 0.535408079624176, + "learning_rate": 1e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.776739239692688, + "num_tokens": 2038747368.0, + "step": 4254 + }, + { + "epoch": 2.5252225519287834, + "grad_norm": 0.5708429217338562, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7561004161834717, + "num_tokens": 2039268461.0, + "step": 4255 + }, + { + "epoch": 2.5258160237388725, + "grad_norm": 0.5501343011856079, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7728812098503113, + "num_tokens": 2039742509.0, + "step": 4256 + }, + { + "epoch": 2.5264094955489615, + "grad_norm": 0.5458099842071533, + "learning_rate": 1e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7811717391014099, + "num_tokens": 2040238709.0, + "step": 4257 + }, + { + "epoch": 
2.5270029673590506, + "grad_norm": 0.5446484684944153, + "learning_rate": 1e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7809891700744629, + "num_tokens": 2040768734.0, + "step": 4258 + }, + { + "epoch": 2.5275964391691392, + "grad_norm": 0.582836389541626, + "learning_rate": 1e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7775822877883911, + "num_tokens": 2041241963.0, + "step": 4259 + }, + { + "epoch": 2.5281899109792283, + "grad_norm": 0.5533749461174011, + "learning_rate": 1e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7727038860321045, + "num_tokens": 2041725463.0, + "step": 4260 + }, + { + "epoch": 2.5287833827893174, + "grad_norm": 0.5533484816551208, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.773338258266449, + "num_tokens": 2042208942.0, + "step": 4261 + }, + { + "epoch": 2.5293768545994064, + "grad_norm": 0.5762999653816223, + "learning_rate": 1e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.772710919380188, + "num_tokens": 2042686821.0, + "step": 4262 + }, + { + "epoch": 2.5299703264094955, + "grad_norm": 0.508275032043457, + "learning_rate": 1e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7624536156654358, + "num_tokens": 2043199101.0, + "step": 4263 + }, + { + "epoch": 2.5305637982195845, + "grad_norm": 0.5359832644462585, + "learning_rate": 1e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7887198328971863, + "num_tokens": 2043650637.0, + "step": 4264 + }, + { + "epoch": 2.5311572700296736, + "grad_norm": 0.555941641330719, + "learning_rate": 1e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7613180875778198, + "num_tokens": 2044129281.0, + "step": 4265 + }, + { + "epoch": 2.5317507418397627, + "grad_norm": 0.5267242193222046, + "learning_rate": 1e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7699974179267883, + "num_tokens": 2044625133.0, + "step": 4266 + }, + { + "epoch": 2.5323442136498517, + "grad_norm": 0.5546938180923462, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 
0.7740144729614258, + "num_tokens": 2045112182.0, + "step": 4267 + }, + { + "epoch": 2.532937685459941, + "grad_norm": 0.5390197038650513, + "learning_rate": 1e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.755811870098114, + "num_tokens": 2045575660.0, + "step": 4268 + }, + { + "epoch": 2.53353115727003, + "grad_norm": 0.5391239523887634, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7736122608184814, + "num_tokens": 2046067234.0, + "step": 4269 + }, + { + "epoch": 2.534124629080119, + "grad_norm": 0.5775859951972961, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7661849856376648, + "num_tokens": 2046476643.0, + "step": 4270 + }, + { + "epoch": 2.534718100890208, + "grad_norm": 0.5228913426399231, + "learning_rate": 1e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7710878849029541, + "num_tokens": 2046972677.0, + "step": 4271 + }, + { + "epoch": 2.5353115727002966, + "grad_norm": 0.5287160277366638, + "learning_rate": 1e-06, + "loss": 0.6776, + "mean_token_accuracy": 0.7812983989715576, + "num_tokens": 2047474493.0, + "step": 4272 + }, + { + "epoch": 2.5359050445103857, + "grad_norm": 0.5382966995239258, + "learning_rate": 1e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7632362842559814, + "num_tokens": 2047947042.0, + "step": 4273 + }, + { + "epoch": 2.5364985163204747, + "grad_norm": 0.5370485782623291, + "learning_rate": 1e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7715449929237366, + "num_tokens": 2048437830.0, + "step": 4274 + }, + { + "epoch": 2.537091988130564, + "grad_norm": 0.5540615320205688, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7621980309486389, + "num_tokens": 2048929079.0, + "step": 4275 + }, + { + "epoch": 2.537685459940653, + "grad_norm": 0.5449056029319763, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7784699201583862, + "num_tokens": 2049402805.0, + "step": 4276 + }, + { + "epoch": 2.538278931750742, + "grad_norm": 
0.5436686873435974, + "learning_rate": 1e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.772335410118103, + "num_tokens": 2049905792.0, + "step": 4277 + }, + { + "epoch": 2.538872403560831, + "grad_norm": 0.5389742851257324, + "learning_rate": 1e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.7821856737136841, + "num_tokens": 2050361225.0, + "step": 4278 + }, + { + "epoch": 2.5394658753709196, + "grad_norm": 0.5607677698135376, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7632046937942505, + "num_tokens": 2050856828.0, + "step": 4279 + }, + { + "epoch": 2.5400593471810087, + "grad_norm": 0.5215904116630554, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7727230787277222, + "num_tokens": 2051392369.0, + "step": 4280 + }, + { + "epoch": 2.5406528189910977, + "grad_norm": 0.5339023470878601, + "learning_rate": 1e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.765023946762085, + "num_tokens": 2051892537.0, + "step": 4281 + }, + { + "epoch": 2.541246290801187, + "grad_norm": 0.559444785118103, + "learning_rate": 1e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7713969945907593, + "num_tokens": 2052382520.0, + "step": 4282 + }, + { + "epoch": 2.541839762611276, + "grad_norm": 0.560145378112793, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7671324014663696, + "num_tokens": 2052869189.0, + "step": 4283 + }, + { + "epoch": 2.542433234421365, + "grad_norm": 0.5236634016036987, + "learning_rate": 1e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7708932161331177, + "num_tokens": 2053377411.0, + "step": 4284 + }, + { + "epoch": 2.543026706231454, + "grad_norm": 0.5728577971458435, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7650395035743713, + "num_tokens": 2053856685.0, + "step": 4285 + }, + { + "epoch": 2.543620178041543, + "grad_norm": 0.5636439919471741, + "learning_rate": 1e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7590839862823486, + "num_tokens": 
2054340789.0, + "step": 4286 + }, + { + "epoch": 2.544213649851632, + "grad_norm": 0.5554097890853882, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7796750068664551, + "num_tokens": 2054835189.0, + "step": 4287 + }, + { + "epoch": 2.544807121661721, + "grad_norm": 0.5380060076713562, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7764246463775635, + "num_tokens": 2055322533.0, + "step": 4288 + }, + { + "epoch": 2.5454005934718102, + "grad_norm": 0.5555359125137329, + "learning_rate": 1e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7649644613265991, + "num_tokens": 2055803879.0, + "step": 4289 + }, + { + "epoch": 2.5459940652818993, + "grad_norm": 0.5955683588981628, + "learning_rate": 1e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7614094614982605, + "num_tokens": 2056274020.0, + "step": 4290 + }, + { + "epoch": 2.5465875370919884, + "grad_norm": 0.5477399826049805, + "learning_rate": 1e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7711789011955261, + "num_tokens": 2056754930.0, + "step": 4291 + }, + { + "epoch": 2.547181008902077, + "grad_norm": 0.5197505354881287, + "learning_rate": 1e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7727867364883423, + "num_tokens": 2057230676.0, + "step": 4292 + }, + { + "epoch": 2.547774480712166, + "grad_norm": 0.5326845049858093, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7674063444137573, + "num_tokens": 2057765657.0, + "step": 4293 + }, + { + "epoch": 2.548367952522255, + "grad_norm": 0.5123476982116699, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.766883373260498, + "num_tokens": 2058287068.0, + "step": 4294 + }, + { + "epoch": 2.548961424332344, + "grad_norm": 0.5491666197776794, + "learning_rate": 1e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7744156718254089, + "num_tokens": 2058787443.0, + "step": 4295 + }, + { + "epoch": 2.5495548961424332, + "grad_norm": 0.5660656690597534, + "learning_rate": 
1e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7635278701782227, + "num_tokens": 2059257773.0, + "step": 4296 + }, + { + "epoch": 2.5501483679525223, + "grad_norm": 0.5306099653244019, + "learning_rate": 1e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7700533866882324, + "num_tokens": 2059734226.0, + "step": 4297 + }, + { + "epoch": 2.5507418397626114, + "grad_norm": 0.5718083381652832, + "learning_rate": 1e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7688983082771301, + "num_tokens": 2060202665.0, + "step": 4298 + }, + { + "epoch": 2.5513353115727004, + "grad_norm": 0.5218863487243652, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7751072645187378, + "num_tokens": 2060706669.0, + "step": 4299 + }, + { + "epoch": 2.551928783382789, + "grad_norm": 0.5614277124404907, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7632859945297241, + "num_tokens": 2061158375.0, + "step": 4300 + }, + { + "epoch": 2.552522255192878, + "grad_norm": 0.5212868452072144, + "learning_rate": 1e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7655994296073914, + "num_tokens": 2061695178.0, + "step": 4301 + }, + { + "epoch": 2.553115727002967, + "grad_norm": 0.5894176363945007, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7660810947418213, + "num_tokens": 2062139334.0, + "step": 4302 + }, + { + "epoch": 2.5537091988130562, + "grad_norm": 0.5537680983543396, + "learning_rate": 1e-06, + "loss": 0.6715, + "mean_token_accuracy": 0.7841846942901611, + "num_tokens": 2062589421.0, + "step": 4303 + }, + { + "epoch": 2.5543026706231453, + "grad_norm": 0.5700660943984985, + "learning_rate": 1e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7568982839584351, + "num_tokens": 2063053928.0, + "step": 4304 + }, + { + "epoch": 2.5548961424332344, + "grad_norm": 0.5490999817848206, + "learning_rate": 1e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7759829759597778, + "num_tokens": 2063508245.0, + "step": 4305 + }, + { 
+ "epoch": 2.5554896142433234, + "grad_norm": 0.5864801406860352, + "learning_rate": 1e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7803785800933838, + "num_tokens": 2063984068.0, + "step": 4306 + }, + { + "epoch": 2.5560830860534125, + "grad_norm": 0.5560380220413208, + "learning_rate": 1e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7808977961540222, + "num_tokens": 2064477337.0, + "step": 4307 + }, + { + "epoch": 2.5566765578635016, + "grad_norm": 0.5741890072822571, + "learning_rate": 1e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.759781002998352, + "num_tokens": 2064957957.0, + "step": 4308 + }, + { + "epoch": 2.5572700296735906, + "grad_norm": 0.5978379845619202, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7629692554473877, + "num_tokens": 2065398468.0, + "step": 4309 + }, + { + "epoch": 2.5578635014836797, + "grad_norm": 0.5731708407402039, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7600884437561035, + "num_tokens": 2065868357.0, + "step": 4310 + }, + { + "epoch": 2.5584569732937688, + "grad_norm": 0.5591439604759216, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7699013948440552, + "num_tokens": 2066311996.0, + "step": 4311 + }, + { + "epoch": 2.559050445103858, + "grad_norm": 0.5729967951774597, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7584421038627625, + "num_tokens": 2066744800.0, + "step": 4312 + }, + { + "epoch": 2.5596439169139464, + "grad_norm": 0.602703332901001, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7655565738677979, + "num_tokens": 2067227843.0, + "step": 4313 + }, + { + "epoch": 2.5602373887240355, + "grad_norm": 0.5748547911643982, + "learning_rate": 1e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7619083523750305, + "num_tokens": 2067663328.0, + "step": 4314 + }, + { + "epoch": 2.5608308605341246, + "grad_norm": 0.543803870677948, + "learning_rate": 1e-06, + "loss": 0.7242, + 
"mean_token_accuracy": 0.7707487940788269, + "num_tokens": 2068136046.0, + "step": 4315 + }, + { + "epoch": 2.5614243323442136, + "grad_norm": 0.5294701457023621, + "learning_rate": 1e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7667198777198792, + "num_tokens": 2068627531.0, + "step": 4316 + }, + { + "epoch": 2.5620178041543027, + "grad_norm": 0.5150215029716492, + "learning_rate": 1e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.7807215452194214, + "num_tokens": 2069136073.0, + "step": 4317 + }, + { + "epoch": 2.5626112759643918, + "grad_norm": 0.559866726398468, + "learning_rate": 1e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7831765413284302, + "num_tokens": 2069577167.0, + "step": 4318 + }, + { + "epoch": 2.563204747774481, + "grad_norm": 0.6011133193969727, + "learning_rate": 1e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7566823959350586, + "num_tokens": 2070013059.0, + "step": 4319 + }, + { + "epoch": 2.56379821958457, + "grad_norm": 0.5683116316795349, + "learning_rate": 1e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7788428068161011, + "num_tokens": 2070497905.0, + "step": 4320 + }, + { + "epoch": 2.5643916913946585, + "grad_norm": 0.5186798572540283, + "learning_rate": 1e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.7828925848007202, + "num_tokens": 2071003544.0, + "step": 4321 + }, + { + "epoch": 2.5649851632047476, + "grad_norm": 0.5584602355957031, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.765078067779541, + "num_tokens": 2071478096.0, + "step": 4322 + }, + { + "epoch": 2.5655786350148366, + "grad_norm": 0.5292479395866394, + "learning_rate": 1e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.774307370185852, + "num_tokens": 2071981480.0, + "step": 4323 + }, + { + "epoch": 2.5661721068249257, + "grad_norm": 0.5647213459014893, + "learning_rate": 1e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7604223489761353, + "num_tokens": 2072459211.0, + "step": 4324 + }, + { + "epoch": 
2.5667655786350148, + "grad_norm": 0.5592495799064636, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7718607187271118, + "num_tokens": 2072905930.0, + "step": 4325 + }, + { + "epoch": 2.567359050445104, + "grad_norm": 0.5466105937957764, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7535587549209595, + "num_tokens": 2073405928.0, + "step": 4326 + }, + { + "epoch": 2.567952522255193, + "grad_norm": 0.5537831783294678, + "learning_rate": 1e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7742525339126587, + "num_tokens": 2073902168.0, + "step": 4327 + }, + { + "epoch": 2.568545994065282, + "grad_norm": 0.5154587626457214, + "learning_rate": 1e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7819734215736389, + "num_tokens": 2074444636.0, + "step": 4328 + }, + { + "epoch": 2.569139465875371, + "grad_norm": 0.5730665922164917, + "learning_rate": 1e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.776239275932312, + "num_tokens": 2074914473.0, + "step": 4329 + }, + { + "epoch": 2.56973293768546, + "grad_norm": 0.5454201698303223, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7749007344245911, + "num_tokens": 2075403575.0, + "step": 4330 + }, + { + "epoch": 2.570326409495549, + "grad_norm": 0.535434901714325, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7627840042114258, + "num_tokens": 2075898598.0, + "step": 4331 + }, + { + "epoch": 2.570919881305638, + "grad_norm": 0.5840641856193542, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7657279968261719, + "num_tokens": 2076347821.0, + "step": 4332 + }, + { + "epoch": 2.5715133531157273, + "grad_norm": 0.5782315135002136, + "learning_rate": 1e-06, + "loss": 0.687, + "mean_token_accuracy": 0.779086709022522, + "num_tokens": 2076803667.0, + "step": 4333 + }, + { + "epoch": 2.572106824925816, + "grad_norm": 0.5499964952468872, + "learning_rate": 1e-06, + "loss": 0.7057, + "mean_token_accuracy": 
0.7748351693153381, + "num_tokens": 2077270984.0, + "step": 4334 + }, + { + "epoch": 2.572700296735905, + "grad_norm": 0.5564629435539246, + "learning_rate": 1e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.775313138961792, + "num_tokens": 2077774978.0, + "step": 4335 + }, + { + "epoch": 2.573293768545994, + "grad_norm": 0.5177892446517944, + "learning_rate": 1e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7688919305801392, + "num_tokens": 2078298201.0, + "step": 4336 + }, + { + "epoch": 2.573887240356083, + "grad_norm": 0.5611860752105713, + "learning_rate": 1e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.7872171401977539, + "num_tokens": 2078781502.0, + "step": 4337 + }, + { + "epoch": 2.574480712166172, + "grad_norm": 0.55203777551651, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7576354146003723, + "num_tokens": 2079304480.0, + "step": 4338 + }, + { + "epoch": 2.575074183976261, + "grad_norm": 0.5460752844810486, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7717815637588501, + "num_tokens": 2079787004.0, + "step": 4339 + }, + { + "epoch": 2.5756676557863503, + "grad_norm": 0.5629968047142029, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7618473172187805, + "num_tokens": 2080250433.0, + "step": 4340 + }, + { + "epoch": 2.5762611275964393, + "grad_norm": 0.5189142227172852, + "learning_rate": 1e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7849926948547363, + "num_tokens": 2080756788.0, + "step": 4341 + }, + { + "epoch": 2.576854599406528, + "grad_norm": 0.5595281720161438, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7649587392807007, + "num_tokens": 2081236734.0, + "step": 4342 + }, + { + "epoch": 2.577448071216617, + "grad_norm": 0.5454772114753723, + "learning_rate": 1e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7679933905601501, + "num_tokens": 2081723855.0, + "step": 4343 + }, + { + "epoch": 2.578041543026706, + "grad_norm": 
0.5322665572166443, + "learning_rate": 1e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7675818204879761, + "num_tokens": 2082215441.0, + "step": 4344 + }, + { + "epoch": 2.578635014836795, + "grad_norm": 0.6038748621940613, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7639402151107788, + "num_tokens": 2082649431.0, + "step": 4345 + }, + { + "epoch": 2.579228486646884, + "grad_norm": 0.5264197587966919, + "learning_rate": 1e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7745280861854553, + "num_tokens": 2083154965.0, + "step": 4346 + }, + { + "epoch": 2.5798219584569733, + "grad_norm": 0.5319514870643616, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7808377742767334, + "num_tokens": 2083609216.0, + "step": 4347 + }, + { + "epoch": 2.5804154302670623, + "grad_norm": 0.5332094430923462, + "learning_rate": 1e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.777125358581543, + "num_tokens": 2084080071.0, + "step": 4348 + }, + { + "epoch": 2.5810089020771514, + "grad_norm": 0.5454282164573669, + "learning_rate": 1e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7669280171394348, + "num_tokens": 2084559059.0, + "step": 4349 + }, + { + "epoch": 2.5816023738872405, + "grad_norm": 0.5249475240707397, + "learning_rate": 1e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7729687690734863, + "num_tokens": 2085067841.0, + "step": 4350 + }, + { + "epoch": 2.5821958456973295, + "grad_norm": 0.5522579550743103, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7703127264976501, + "num_tokens": 2085553494.0, + "step": 4351 + }, + { + "epoch": 2.5827893175074186, + "grad_norm": 0.53175950050354, + "learning_rate": 1e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7702901363372803, + "num_tokens": 2086065776.0, + "step": 4352 + }, + { + "epoch": 2.5833827893175076, + "grad_norm": 0.5503483414649963, + "learning_rate": 1e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.760898232460022, + "num_tokens": 
2086581960.0, + "step": 4353 + }, + { + "epoch": 2.5839762611275967, + "grad_norm": 0.5340427756309509, + "learning_rate": 1e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7750740051269531, + "num_tokens": 2087080296.0, + "step": 4354 + }, + { + "epoch": 2.5845697329376853, + "grad_norm": 0.5376059412956238, + "learning_rate": 1e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7557923197746277, + "num_tokens": 2087539191.0, + "step": 4355 + }, + { + "epoch": 2.5851632047477744, + "grad_norm": 0.561637818813324, + "learning_rate": 1e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.7768417000770569, + "num_tokens": 2088012294.0, + "step": 4356 + }, + { + "epoch": 2.5857566765578635, + "grad_norm": 0.6119292378425598, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7697444558143616, + "num_tokens": 2088452959.0, + "step": 4357 + }, + { + "epoch": 2.5863501483679525, + "grad_norm": 0.5360332727432251, + "learning_rate": 1e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7768056392669678, + "num_tokens": 2088942364.0, + "step": 4358 + }, + { + "epoch": 2.5869436201780416, + "grad_norm": 0.5571079254150391, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7647006511688232, + "num_tokens": 2089425704.0, + "step": 4359 + }, + { + "epoch": 2.5875370919881306, + "grad_norm": 0.5669910907745361, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7678529620170593, + "num_tokens": 2089926520.0, + "step": 4360 + }, + { + "epoch": 2.5881305637982197, + "grad_norm": 0.58609539270401, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7735977172851562, + "num_tokens": 2090390652.0, + "step": 4361 + }, + { + "epoch": 2.5887240356083088, + "grad_norm": 0.5327203273773193, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7726485133171082, + "num_tokens": 2090890967.0, + "step": 4362 + }, + { + "epoch": 2.5893175074183974, + "grad_norm": 0.5391127467155457, + 
"learning_rate": 1e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7656034231185913, + "num_tokens": 2091383245.0, + "step": 4363 + }, + { + "epoch": 2.5899109792284865, + "grad_norm": 0.5568010210990906, + "learning_rate": 1e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.780106782913208, + "num_tokens": 2091862694.0, + "step": 4364 + }, + { + "epoch": 2.5905044510385755, + "grad_norm": 0.5656195282936096, + "learning_rate": 1e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7827301025390625, + "num_tokens": 2092335154.0, + "step": 4365 + }, + { + "epoch": 2.5910979228486646, + "grad_norm": 0.5278459787368774, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7759736776351929, + "num_tokens": 2092874195.0, + "step": 4366 + }, + { + "epoch": 2.5916913946587536, + "grad_norm": 0.5417872667312622, + "learning_rate": 1e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7839713096618652, + "num_tokens": 2093312885.0, + "step": 4367 + }, + { + "epoch": 2.5922848664688427, + "grad_norm": 0.5544756054878235, + "learning_rate": 1e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7736678719520569, + "num_tokens": 2093830875.0, + "step": 4368 + }, + { + "epoch": 2.5928783382789318, + "grad_norm": 0.5859412550926208, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7680261135101318, + "num_tokens": 2094274882.0, + "step": 4369 + }, + { + "epoch": 2.593471810089021, + "grad_norm": 0.6010940074920654, + "learning_rate": 1e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7741338014602661, + "num_tokens": 2094683901.0, + "step": 4370 + }, + { + "epoch": 2.59406528189911, + "grad_norm": 0.5622941851615906, + "learning_rate": 1e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7622735500335693, + "num_tokens": 2095138376.0, + "step": 4371 + }, + { + "epoch": 2.594658753709199, + "grad_norm": 0.6662549376487732, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7611069679260254, + "num_tokens": 2095563046.0, + 
"step": 4372 + }, + { + "epoch": 2.595252225519288, + "grad_norm": 0.5575079321861267, + "learning_rate": 1e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7722209095954895, + "num_tokens": 2096039150.0, + "step": 4373 + }, + { + "epoch": 2.595845697329377, + "grad_norm": 0.558754026889801, + "learning_rate": 1e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7775630354881287, + "num_tokens": 2096541894.0, + "step": 4374 + }, + { + "epoch": 2.596439169139466, + "grad_norm": 0.5479133725166321, + "learning_rate": 1e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.7888270616531372, + "num_tokens": 2097019889.0, + "step": 4375 + }, + { + "epoch": 2.5970326409495548, + "grad_norm": 0.5502229928970337, + "learning_rate": 1e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.776343822479248, + "num_tokens": 2097523636.0, + "step": 4376 + }, + { + "epoch": 2.597626112759644, + "grad_norm": 0.5270708203315735, + "learning_rate": 1e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7672982215881348, + "num_tokens": 2098012536.0, + "step": 4377 + }, + { + "epoch": 2.598219584569733, + "grad_norm": 0.5397940874099731, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7749629616737366, + "num_tokens": 2098497977.0, + "step": 4378 + }, + { + "epoch": 2.598813056379822, + "grad_norm": 0.5450974702835083, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7655069231987, + "num_tokens": 2098958180.0, + "step": 4379 + }, + { + "epoch": 2.599406528189911, + "grad_norm": 0.5323835611343384, + "learning_rate": 1e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7785369157791138, + "num_tokens": 2099437795.0, + "step": 4380 + }, + { + "epoch": 2.6, + "grad_norm": 0.5680884718894958, + "learning_rate": 1e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7665705680847168, + "num_tokens": 2099904795.0, + "step": 4381 + }, + { + "epoch": 2.600593471810089, + "grad_norm": 0.5287861227989197, + "learning_rate": 1e-06, + "loss": 0.7285, + 
"mean_token_accuracy": 0.7683840394020081, + "num_tokens": 2100439504.0, + "step": 4382 + }, + { + "epoch": 2.6011869436201778, + "grad_norm": 0.5231912136077881, + "learning_rate": 1e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7686726450920105, + "num_tokens": 2100942841.0, + "step": 4383 + }, + { + "epoch": 2.601780415430267, + "grad_norm": 0.5465995669364929, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7718222737312317, + "num_tokens": 2101403174.0, + "step": 4384 + }, + { + "epoch": 2.602373887240356, + "grad_norm": 0.5570071339607239, + "learning_rate": 1e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.79710853099823, + "num_tokens": 2101856711.0, + "step": 4385 + }, + { + "epoch": 2.602967359050445, + "grad_norm": 0.5462626814842224, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7588030099868774, + "num_tokens": 2102322641.0, + "step": 4386 + }, + { + "epoch": 2.603560830860534, + "grad_norm": 0.5562899112701416, + "learning_rate": 1e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7711648941040039, + "num_tokens": 2102782244.0, + "step": 4387 + }, + { + "epoch": 2.604154302670623, + "grad_norm": 0.5439364910125732, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7724833488464355, + "num_tokens": 2103305473.0, + "step": 4388 + }, + { + "epoch": 2.604747774480712, + "grad_norm": 0.526123583316803, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7586967945098877, + "num_tokens": 2103795338.0, + "step": 4389 + }, + { + "epoch": 2.605341246290801, + "grad_norm": 0.5516970753669739, + "learning_rate": 1e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7786704301834106, + "num_tokens": 2104275965.0, + "step": 4390 + }, + { + "epoch": 2.6059347181008903, + "grad_norm": 0.5640460252761841, + "learning_rate": 1e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7759882807731628, + "num_tokens": 2104742642.0, + "step": 4391 + }, + { + "epoch": 2.6065281899109793, 
+ "grad_norm": 0.5460849404335022, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7757664322853088, + "num_tokens": 2105235846.0, + "step": 4392 + }, + { + "epoch": 2.6071216617210684, + "grad_norm": 0.5367692112922668, + "learning_rate": 1e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7686811685562134, + "num_tokens": 2105699769.0, + "step": 4393 + }, + { + "epoch": 2.6077151335311575, + "grad_norm": 0.5625526905059814, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7629016041755676, + "num_tokens": 2106184962.0, + "step": 4394 + }, + { + "epoch": 2.6083086053412465, + "grad_norm": 0.5265967845916748, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7612524628639221, + "num_tokens": 2106694435.0, + "step": 4395 + }, + { + "epoch": 2.608902077151335, + "grad_norm": 0.5582574605941772, + "learning_rate": 1e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7683340311050415, + "num_tokens": 2107164428.0, + "step": 4396 + }, + { + "epoch": 2.609495548961424, + "grad_norm": 0.517877995967865, + "learning_rate": 1e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.7892553210258484, + "num_tokens": 2107681279.0, + "step": 4397 + }, + { + "epoch": 2.6100890207715133, + "grad_norm": 0.5219470858573914, + "learning_rate": 1e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7853761911392212, + "num_tokens": 2108217973.0, + "step": 4398 + }, + { + "epoch": 2.6106824925816023, + "grad_norm": 0.546388566493988, + "learning_rate": 1e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.7891642451286316, + "num_tokens": 2108681020.0, + "step": 4399 + }, + { + "epoch": 2.6112759643916914, + "grad_norm": 0.5584214329719543, + "learning_rate": 1e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7731592655181885, + "num_tokens": 2109133347.0, + "step": 4400 + }, + { + "epoch": 2.6118694362017805, + "grad_norm": 0.5465847849845886, + "learning_rate": 1e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7610862255096436, 
+ "num_tokens": 2109621103.0, + "step": 4401 + }, + { + "epoch": 2.6124629080118695, + "grad_norm": 0.5513139367103577, + "learning_rate": 1e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7747310400009155, + "num_tokens": 2110103816.0, + "step": 4402 + }, + { + "epoch": 2.6130563798219586, + "grad_norm": 0.564856231212616, + "learning_rate": 1e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.78151535987854, + "num_tokens": 2110554955.0, + "step": 4403 + }, + { + "epoch": 2.613649851632047, + "grad_norm": 0.5410582423210144, + "learning_rate": 1e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.7913689613342285, + "num_tokens": 2111072661.0, + "step": 4404 + }, + { + "epoch": 2.6142433234421363, + "grad_norm": 0.5522658824920654, + "learning_rate": 1e-06, + "loss": 0.651, + "mean_token_accuracy": 0.7898847460746765, + "num_tokens": 2111565617.0, + "step": 4405 + }, + { + "epoch": 2.6148367952522253, + "grad_norm": 0.5197257995605469, + "learning_rate": 1e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7615408301353455, + "num_tokens": 2112081074.0, + "step": 4406 + }, + { + "epoch": 2.6154302670623144, + "grad_norm": 0.5499061942100525, + "learning_rate": 1e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7753971219062805, + "num_tokens": 2112532800.0, + "step": 4407 + }, + { + "epoch": 2.6160237388724035, + "grad_norm": 0.595603346824646, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7673754692077637, + "num_tokens": 2112983854.0, + "step": 4408 + }, + { + "epoch": 2.6166172106824925, + "grad_norm": 0.5327318906784058, + "learning_rate": 1e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7795518636703491, + "num_tokens": 2113506686.0, + "step": 4409 + }, + { + "epoch": 2.6172106824925816, + "grad_norm": 0.5331252813339233, + "learning_rate": 1e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7775453329086304, + "num_tokens": 2114032006.0, + "step": 4410 + }, + { + "epoch": 2.6178041543026707, + "grad_norm": 0.5376008152961731, + 
"learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7565277814865112, + "num_tokens": 2114519456.0, + "step": 4411 + }, + { + "epoch": 2.6183976261127597, + "grad_norm": 0.5910245180130005, + "learning_rate": 1e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.7812201380729675, + "num_tokens": 2114933525.0, + "step": 4412 + }, + { + "epoch": 2.618991097922849, + "grad_norm": 0.5354018807411194, + "learning_rate": 1e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.7830808162689209, + "num_tokens": 2115413059.0, + "step": 4413 + }, + { + "epoch": 2.619584569732938, + "grad_norm": 0.5375077724456787, + "learning_rate": 1e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7753409147262573, + "num_tokens": 2115933721.0, + "step": 4414 + }, + { + "epoch": 2.620178041543027, + "grad_norm": 0.539271891117096, + "learning_rate": 1e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.761168897151947, + "num_tokens": 2116436141.0, + "step": 4415 + }, + { + "epoch": 2.620771513353116, + "grad_norm": 0.5419471263885498, + "learning_rate": 1e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7809984087944031, + "num_tokens": 2116935573.0, + "step": 4416 + }, + { + "epoch": 2.6213649851632046, + "grad_norm": 0.5401575565338135, + "learning_rate": 1e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.7791731357574463, + "num_tokens": 2117370261.0, + "step": 4417 + }, + { + "epoch": 2.6219584569732937, + "grad_norm": 0.5293732285499573, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7712768316268921, + "num_tokens": 2117858289.0, + "step": 4418 + }, + { + "epoch": 2.6225519287833827, + "grad_norm": 0.5693888664245605, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7539715766906738, + "num_tokens": 2118336761.0, + "step": 4419 + }, + { + "epoch": 2.623145400593472, + "grad_norm": 0.5588921904563904, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7605443000793457, + "num_tokens": 2118788148.0, + 
"step": 4420 + }, + { + "epoch": 2.623738872403561, + "grad_norm": 0.5506988167762756, + "learning_rate": 1e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7541797161102295, + "num_tokens": 2119254585.0, + "step": 4421 + }, + { + "epoch": 2.62433234421365, + "grad_norm": 0.5199070572853088, + "learning_rate": 1e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7668620944023132, + "num_tokens": 2119744228.0, + "step": 4422 + }, + { + "epoch": 2.624925816023739, + "grad_norm": 0.5143157243728638, + "learning_rate": 1e-06, + "loss": 0.6358, + "mean_token_accuracy": 0.7940330505371094, + "num_tokens": 2120254605.0, + "step": 4423 + }, + { + "epoch": 2.625519287833828, + "grad_norm": 0.5498684644699097, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7763984799385071, + "num_tokens": 2120721598.0, + "step": 4424 + }, + { + "epoch": 2.6261127596439167, + "grad_norm": 0.5721725821495056, + "learning_rate": 1e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7754095792770386, + "num_tokens": 2121207472.0, + "step": 4425 + }, + { + "epoch": 2.6267062314540057, + "grad_norm": 0.5452777743339539, + "learning_rate": 1e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7630609273910522, + "num_tokens": 2121693083.0, + "step": 4426 + }, + { + "epoch": 2.627299703264095, + "grad_norm": 0.5278748869895935, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7736173868179321, + "num_tokens": 2122205619.0, + "step": 4427 + }, + { + "epoch": 2.627893175074184, + "grad_norm": 0.5925109386444092, + "learning_rate": 1e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7753673195838928, + "num_tokens": 2122632887.0, + "step": 4428 + }, + { + "epoch": 2.628486646884273, + "grad_norm": 0.5635607242584229, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.757335901260376, + "num_tokens": 2123103580.0, + "step": 4429 + }, + { + "epoch": 2.629080118694362, + "grad_norm": 0.5519684553146362, + "learning_rate": 1e-06, + "loss": 
0.7072, + "mean_token_accuracy": 0.7750803232192993, + "num_tokens": 2123562717.0, + "step": 4430 + }, + { + "epoch": 2.629673590504451, + "grad_norm": 0.547781765460968, + "learning_rate": 1e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7778217792510986, + "num_tokens": 2124023318.0, + "step": 4431 + }, + { + "epoch": 2.63026706231454, + "grad_norm": 0.5620453953742981, + "learning_rate": 1e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.781268835067749, + "num_tokens": 2124478670.0, + "step": 4432 + }, + { + "epoch": 2.630860534124629, + "grad_norm": 0.5313220024108887, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7809919118881226, + "num_tokens": 2124947393.0, + "step": 4433 + }, + { + "epoch": 2.6314540059347182, + "grad_norm": 0.5594726800918579, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7654274106025696, + "num_tokens": 2125434687.0, + "step": 4434 + }, + { + "epoch": 2.6320474777448073, + "grad_norm": 0.5524805784225464, + "learning_rate": 1e-06, + "loss": 0.697, + "mean_token_accuracy": 0.7750402688980103, + "num_tokens": 2125909938.0, + "step": 4435 + }, + { + "epoch": 2.6326409495548964, + "grad_norm": 0.5517454147338867, + "learning_rate": 1e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.763736367225647, + "num_tokens": 2126373520.0, + "step": 4436 + }, + { + "epoch": 2.6332344213649854, + "grad_norm": 0.55029296875, + "learning_rate": 1e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7748420238494873, + "num_tokens": 2126841079.0, + "step": 4437 + }, + { + "epoch": 2.633827893175074, + "grad_norm": 0.5527129769325256, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7679656147956848, + "num_tokens": 2127301628.0, + "step": 4438 + }, + { + "epoch": 2.634421364985163, + "grad_norm": 0.5362277626991272, + "learning_rate": 1e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7688373923301697, + "num_tokens": 2127778257.0, + "step": 4439 + }, + { + "epoch": 
2.635014836795252, + "grad_norm": 0.5659182071685791, + "learning_rate": 1e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7708295583724976, + "num_tokens": 2128257957.0, + "step": 4440 + }, + { + "epoch": 2.6356083086053412, + "grad_norm": 0.5623230934143066, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7630596160888672, + "num_tokens": 2128707450.0, + "step": 4441 + }, + { + "epoch": 2.6362017804154303, + "grad_norm": 0.529935359954834, + "learning_rate": 1e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7692986726760864, + "num_tokens": 2129212221.0, + "step": 4442 + }, + { + "epoch": 2.6367952522255194, + "grad_norm": 0.5580366849899292, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7757619023323059, + "num_tokens": 2129712854.0, + "step": 4443 + }, + { + "epoch": 2.6373887240356084, + "grad_norm": 0.5176530480384827, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7708568572998047, + "num_tokens": 2130218775.0, + "step": 4444 + }, + { + "epoch": 2.6379821958456975, + "grad_norm": 0.5155560374259949, + "learning_rate": 1e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7603247165679932, + "num_tokens": 2130740722.0, + "step": 4445 + }, + { + "epoch": 2.638575667655786, + "grad_norm": 0.5279533267021179, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7766878008842468, + "num_tokens": 2131230138.0, + "step": 4446 + }, + { + "epoch": 2.639169139465875, + "grad_norm": 0.5755109190940857, + "learning_rate": 1e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7763630151748657, + "num_tokens": 2131696586.0, + "step": 4447 + }, + { + "epoch": 2.6397626112759642, + "grad_norm": 0.5944138765335083, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7681285738945007, + "num_tokens": 2132131174.0, + "step": 4448 + }, + { + "epoch": 2.6403560830860533, + "grad_norm": 0.566120982170105, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 
0.7582179307937622, + "num_tokens": 2132606565.0, + "step": 4449 + }, + { + "epoch": 2.6409495548961424, + "grad_norm": 0.5399830937385559, + "learning_rate": 1e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.779267430305481, + "num_tokens": 2133122308.0, + "step": 4450 + }, + { + "epoch": 2.6415430267062314, + "grad_norm": 0.5674878358840942, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.765042245388031, + "num_tokens": 2133577520.0, + "step": 4451 + }, + { + "epoch": 2.6421364985163205, + "grad_norm": 0.5435426235198975, + "learning_rate": 1e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7707692384719849, + "num_tokens": 2134077467.0, + "step": 4452 + }, + { + "epoch": 2.6427299703264095, + "grad_norm": 0.5859962701797485, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7622036337852478, + "num_tokens": 2134566245.0, + "step": 4453 + }, + { + "epoch": 2.6433234421364986, + "grad_norm": 0.5719574093818665, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7646094560623169, + "num_tokens": 2135052384.0, + "step": 4454 + }, + { + "epoch": 2.6439169139465877, + "grad_norm": 0.5298524498939514, + "learning_rate": 1e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7685421109199524, + "num_tokens": 2135537333.0, + "step": 4455 + }, + { + "epoch": 2.6445103857566767, + "grad_norm": 0.5763670802116394, + "learning_rate": 1e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7786422967910767, + "num_tokens": 2135988105.0, + "step": 4456 + }, + { + "epoch": 2.645103857566766, + "grad_norm": 0.5718287825584412, + "learning_rate": 1e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7539457082748413, + "num_tokens": 2136459859.0, + "step": 4457 + }, + { + "epoch": 2.645697329376855, + "grad_norm": 0.5468531250953674, + "learning_rate": 1e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7624280452728271, + "num_tokens": 2136956256.0, + "step": 4458 + }, + { + "epoch": 2.6462908011869435, + "grad_norm": 
0.5429820418357849, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7722588777542114, + "num_tokens": 2137438418.0, + "step": 4459 + }, + { + "epoch": 2.6468842729970326, + "grad_norm": 0.5037072896957397, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7702155113220215, + "num_tokens": 2137961773.0, + "step": 4460 + }, + { + "epoch": 2.6474777448071216, + "grad_norm": 0.5291022062301636, + "learning_rate": 1e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.7806909084320068, + "num_tokens": 2138453588.0, + "step": 4461 + }, + { + "epoch": 2.6480712166172107, + "grad_norm": 0.5541940331459045, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7499021291732788, + "num_tokens": 2138931876.0, + "step": 4462 + }, + { + "epoch": 2.6486646884272997, + "grad_norm": 0.5503678321838379, + "learning_rate": 1e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7776084542274475, + "num_tokens": 2139400738.0, + "step": 4463 + }, + { + "epoch": 2.649258160237389, + "grad_norm": 0.5165129899978638, + "learning_rate": 1e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7755453586578369, + "num_tokens": 2139898193.0, + "step": 4464 + }, + { + "epoch": 2.649851632047478, + "grad_norm": 0.5495750904083252, + "learning_rate": 1e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7670576572418213, + "num_tokens": 2140382831.0, + "step": 4465 + }, + { + "epoch": 2.650445103857567, + "grad_norm": 0.5689319968223572, + "learning_rate": 1e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7697486877441406, + "num_tokens": 2140820059.0, + "step": 4466 + }, + { + "epoch": 2.6510385756676556, + "grad_norm": 0.5521285533905029, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7731491327285767, + "num_tokens": 2141297563.0, + "step": 4467 + }, + { + "epoch": 2.6516320474777446, + "grad_norm": 0.5780566334724426, + "learning_rate": 1e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7699669599533081, + 
"num_tokens": 2141746264.0, + "step": 4468 + }, + { + "epoch": 2.6522255192878337, + "grad_norm": 0.5654661059379578, + "learning_rate": 1e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7652270793914795, + "num_tokens": 2142219428.0, + "step": 4469 + }, + { + "epoch": 2.6528189910979227, + "grad_norm": 0.5536571145057678, + "learning_rate": 1e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.771282970905304, + "num_tokens": 2142697613.0, + "step": 4470 + }, + { + "epoch": 2.653412462908012, + "grad_norm": 0.5645928978919983, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7684829235076904, + "num_tokens": 2143146708.0, + "step": 4471 + }, + { + "epoch": 2.654005934718101, + "grad_norm": 0.5244740843772888, + "learning_rate": 1e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7822854518890381, + "num_tokens": 2143654815.0, + "step": 4472 + }, + { + "epoch": 2.65459940652819, + "grad_norm": 0.571284830570221, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7580597400665283, + "num_tokens": 2144123646.0, + "step": 4473 + }, + { + "epoch": 2.655192878338279, + "grad_norm": 0.5695533752441406, + "learning_rate": 1e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7742816805839539, + "num_tokens": 2144611923.0, + "step": 4474 + }, + { + "epoch": 2.655786350148368, + "grad_norm": 0.5439967513084412, + "learning_rate": 1e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7806957960128784, + "num_tokens": 2145050102.0, + "step": 4475 + }, + { + "epoch": 2.656379821958457, + "grad_norm": 0.5199092626571655, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7757956981658936, + "num_tokens": 2145560721.0, + "step": 4476 + }, + { + "epoch": 2.656973293768546, + "grad_norm": 0.5519347786903381, + "learning_rate": 1e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7729567289352417, + "num_tokens": 2146047260.0, + "step": 4477 + }, + { + "epoch": 2.6575667655786352, + "grad_norm": 0.5653048753738403, + 
"learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7669320106506348, + "num_tokens": 2146508934.0, + "step": 4478 + }, + { + "epoch": 2.6581602373887243, + "grad_norm": 0.5478739738464355, + "learning_rate": 1e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7661042809486389, + "num_tokens": 2146990369.0, + "step": 4479 + }, + { + "epoch": 2.658753709198813, + "grad_norm": 0.5518019795417786, + "learning_rate": 1e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7782436013221741, + "num_tokens": 2147475097.0, + "step": 4480 + }, + { + "epoch": 2.659347181008902, + "grad_norm": 0.5601062178611755, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7647498250007629, + "num_tokens": 2147927790.0, + "step": 4481 + }, + { + "epoch": 2.659940652818991, + "grad_norm": 0.5397257804870605, + "learning_rate": 1e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7816632986068726, + "num_tokens": 2148394661.0, + "step": 4482 + }, + { + "epoch": 2.66053412462908, + "grad_norm": 0.5597017407417297, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7601025700569153, + "num_tokens": 2148844570.0, + "step": 4483 + }, + { + "epoch": 2.661127596439169, + "grad_norm": 0.5236587524414062, + "learning_rate": 1e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.7906650304794312, + "num_tokens": 2149375370.0, + "step": 4484 + }, + { + "epoch": 2.6617210682492582, + "grad_norm": 0.5408154726028442, + "learning_rate": 1e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7707157135009766, + "num_tokens": 2149835702.0, + "step": 4485 + }, + { + "epoch": 2.6623145400593473, + "grad_norm": 0.544536292552948, + "learning_rate": 1e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.7825794219970703, + "num_tokens": 2150333665.0, + "step": 4486 + }, + { + "epoch": 2.662908011869436, + "grad_norm": 0.539438784122467, + "learning_rate": 1e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7759424448013306, + "num_tokens": 2150833545.0, + "step": 
4487 + }, + { + "epoch": 2.663501483679525, + "grad_norm": 0.5637767910957336, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7534074783325195, + "num_tokens": 2151287706.0, + "step": 4488 + }, + { + "epoch": 2.664094955489614, + "grad_norm": 0.5405700206756592, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.774671733379364, + "num_tokens": 2151772546.0, + "step": 4489 + }, + { + "epoch": 2.664688427299703, + "grad_norm": 0.5596590638160706, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7634201645851135, + "num_tokens": 2152219210.0, + "step": 4490 + }, + { + "epoch": 2.665281899109792, + "grad_norm": 0.5557166337966919, + "learning_rate": 1e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7777246832847595, + "num_tokens": 2152712204.0, + "step": 4491 + }, + { + "epoch": 2.6658753709198812, + "grad_norm": 0.5302931666374207, + "learning_rate": 1e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7856985926628113, + "num_tokens": 2153237103.0, + "step": 4492 + }, + { + "epoch": 2.6664688427299703, + "grad_norm": 0.5413264632225037, + "learning_rate": 1e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.780624270439148, + "num_tokens": 2153714524.0, + "step": 4493 + }, + { + "epoch": 2.6670623145400594, + "grad_norm": 0.5360764861106873, + "learning_rate": 1e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.780341386795044, + "num_tokens": 2154190892.0, + "step": 4494 + }, + { + "epoch": 2.6676557863501484, + "grad_norm": 0.5542777180671692, + "learning_rate": 1e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7651420831680298, + "num_tokens": 2154650661.0, + "step": 4495 + }, + { + "epoch": 2.6682492581602375, + "grad_norm": 0.5491435527801514, + "learning_rate": 1e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7729008793830872, + "num_tokens": 2155125556.0, + "step": 4496 + }, + { + "epoch": 2.6688427299703266, + "grad_norm": 0.5952708721160889, + "learning_rate": 1e-06, + "loss": 0.701, + 
"mean_token_accuracy": 0.777284562587738, + "num_tokens": 2155630134.0, + "step": 4497 + }, + { + "epoch": 2.6694362017804156, + "grad_norm": 0.5770132541656494, + "learning_rate": 1e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.75322425365448, + "num_tokens": 2156089777.0, + "step": 4498 + }, + { + "epoch": 2.6700296735905047, + "grad_norm": 0.5725353360176086, + "learning_rate": 1e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7869721055030823, + "num_tokens": 2156547905.0, + "step": 4499 + }, + { + "epoch": 2.6706231454005933, + "grad_norm": 0.5811443328857422, + "learning_rate": 1e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7777478098869324, + "num_tokens": 2157020991.0, + "step": 4500 + }, + { + "epoch": 2.6712166172106824, + "grad_norm": 0.5270571112632751, + "learning_rate": 1e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.7843064069747925, + "num_tokens": 2157522935.0, + "step": 4501 + }, + { + "epoch": 2.6718100890207714, + "grad_norm": 0.5640924572944641, + "learning_rate": 1e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.7730262875556946, + "num_tokens": 2158009791.0, + "step": 4502 + }, + { + "epoch": 2.6724035608308605, + "grad_norm": 0.578839898109436, + "learning_rate": 1e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7866331338882446, + "num_tokens": 2158484900.0, + "step": 4503 + }, + { + "epoch": 2.6729970326409496, + "grad_norm": 0.5561923384666443, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7639791369438171, + "num_tokens": 2158977442.0, + "step": 4504 + }, + { + "epoch": 2.6735905044510386, + "grad_norm": 0.554504930973053, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7722525596618652, + "num_tokens": 2159488207.0, + "step": 4505 + }, + { + "epoch": 2.6741839762611277, + "grad_norm": 0.5532509088516235, + "learning_rate": 1e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7680519819259644, + "num_tokens": 2159977628.0, + "step": 4506 + }, + { + "epoch": 
2.6747774480712168, + "grad_norm": 0.546238362789154, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7610435485839844, + "num_tokens": 2160455152.0, + "step": 4507 + }, + { + "epoch": 2.6753709198813054, + "grad_norm": 0.5461841225624084, + "learning_rate": 1e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7663917541503906, + "num_tokens": 2160936992.0, + "step": 4508 + }, + { + "epoch": 2.6759643916913944, + "grad_norm": 0.5345828533172607, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7720819711685181, + "num_tokens": 2161439051.0, + "step": 4509 + }, + { + "epoch": 2.6765578635014835, + "grad_norm": 0.5544705986976624, + "learning_rate": 1e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7758663892745972, + "num_tokens": 2161926361.0, + "step": 4510 + }, + { + "epoch": 2.6771513353115726, + "grad_norm": 0.5696911811828613, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7619366645812988, + "num_tokens": 2162431521.0, + "step": 4511 + }, + { + "epoch": 2.6777448071216616, + "grad_norm": 0.5249671936035156, + "learning_rate": 1e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7733618021011353, + "num_tokens": 2162919773.0, + "step": 4512 + }, + { + "epoch": 2.6783382789317507, + "grad_norm": 0.588026762008667, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7684335708618164, + "num_tokens": 2163368870.0, + "step": 4513 + }, + { + "epoch": 2.6789317507418398, + "grad_norm": 0.5749326348304749, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7706025838851929, + "num_tokens": 2163862093.0, + "step": 4514 + }, + { + "epoch": 2.679525222551929, + "grad_norm": 0.5229614973068237, + "learning_rate": 1e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7800790071487427, + "num_tokens": 2164339169.0, + "step": 4515 + }, + { + "epoch": 2.680118694362018, + "grad_norm": 0.5368990898132324, + "learning_rate": 1e-06, + "loss": 0.7144, + "mean_token_accuracy": 
0.7728589773178101, + "num_tokens": 2164825354.0, + "step": 4516 + }, + { + "epoch": 2.680712166172107, + "grad_norm": 0.5584643483161926, + "learning_rate": 1e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7626766562461853, + "num_tokens": 2165283094.0, + "step": 4517 + }, + { + "epoch": 2.681305637982196, + "grad_norm": 0.5289750099182129, + "learning_rate": 1e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7733898162841797, + "num_tokens": 2165766111.0, + "step": 4518 + }, + { + "epoch": 2.681899109792285, + "grad_norm": 0.5333163738250732, + "learning_rate": 1e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.7783709168434143, + "num_tokens": 2166250459.0, + "step": 4519 + }, + { + "epoch": 2.682492581602374, + "grad_norm": 0.5487381815910339, + "learning_rate": 1e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.774849534034729, + "num_tokens": 2166704457.0, + "step": 4520 + }, + { + "epoch": 2.6830860534124628, + "grad_norm": 0.552248477935791, + "learning_rate": 1e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7720527648925781, + "num_tokens": 2167181104.0, + "step": 4521 + }, + { + "epoch": 2.683679525222552, + "grad_norm": 0.5603891015052795, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.763154149055481, + "num_tokens": 2167630477.0, + "step": 4522 + }, + { + "epoch": 2.684272997032641, + "grad_norm": 0.5485610365867615, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7775282859802246, + "num_tokens": 2168069787.0, + "step": 4523 + }, + { + "epoch": 2.68486646884273, + "grad_norm": 0.5164873003959656, + "learning_rate": 1e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7787549495697021, + "num_tokens": 2168571595.0, + "step": 4524 + }, + { + "epoch": 2.685459940652819, + "grad_norm": 0.5486809611320496, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7600874304771423, + "num_tokens": 2169042685.0, + "step": 4525 + }, + { + "epoch": 2.686053412462908, + "grad_norm": 
0.5199675559997559, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7635015249252319, + "num_tokens": 2169544572.0, + "step": 4526 + }, + { + "epoch": 2.686646884272997, + "grad_norm": 0.519536554813385, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7707134485244751, + "num_tokens": 2170036157.0, + "step": 4527 + }, + { + "epoch": 2.687240356083086, + "grad_norm": 0.5457813739776611, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7732863426208496, + "num_tokens": 2170505540.0, + "step": 4528 + }, + { + "epoch": 2.687833827893175, + "grad_norm": 0.5999771952629089, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7710716128349304, + "num_tokens": 2171003414.0, + "step": 4529 + }, + { + "epoch": 2.688427299703264, + "grad_norm": 0.5429704785346985, + "learning_rate": 1e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7744815349578857, + "num_tokens": 2171473026.0, + "step": 4530 + }, + { + "epoch": 2.689020771513353, + "grad_norm": 0.5244166254997253, + "learning_rate": 1e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7803441286087036, + "num_tokens": 2171966718.0, + "step": 4531 + }, + { + "epoch": 2.689614243323442, + "grad_norm": 0.560799777507782, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7613797187805176, + "num_tokens": 2172398887.0, + "step": 4532 + }, + { + "epoch": 2.690207715133531, + "grad_norm": 0.5181000232696533, + "learning_rate": 1e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7807880640029907, + "num_tokens": 2172924687.0, + "step": 4533 + }, + { + "epoch": 2.69080118694362, + "grad_norm": 0.5681254267692566, + "learning_rate": 1e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7683509588241577, + "num_tokens": 2173404845.0, + "step": 4534 + }, + { + "epoch": 2.691394658753709, + "grad_norm": 0.5225550532341003, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7577413320541382, + "num_tokens": 
2173924966.0, + "step": 4535 + }, + { + "epoch": 2.6919881305637983, + "grad_norm": 0.5554695129394531, + "learning_rate": 1e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7758132815361023, + "num_tokens": 2174417659.0, + "step": 4536 + }, + { + "epoch": 2.6925816023738873, + "grad_norm": 0.49781814217567444, + "learning_rate": 1e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7746788263320923, + "num_tokens": 2174987385.0, + "step": 4537 + }, + { + "epoch": 2.6931750741839764, + "grad_norm": 0.5399878025054932, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7623719573020935, + "num_tokens": 2175465762.0, + "step": 4538 + }, + { + "epoch": 2.6937685459940655, + "grad_norm": 0.5902793407440186, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7659769654273987, + "num_tokens": 2175948464.0, + "step": 4539 + }, + { + "epoch": 2.6943620178041545, + "grad_norm": 0.5364344120025635, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7700626850128174, + "num_tokens": 2176441982.0, + "step": 4540 + }, + { + "epoch": 2.6949554896142436, + "grad_norm": 0.5394247174263, + "learning_rate": 1e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7703231573104858, + "num_tokens": 2176929656.0, + "step": 4541 + }, + { + "epoch": 2.695548961424332, + "grad_norm": 0.596435546875, + "learning_rate": 1e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.777904748916626, + "num_tokens": 2177366133.0, + "step": 4542 + }, + { + "epoch": 2.6961424332344213, + "grad_norm": 0.5388524532318115, + "learning_rate": 1e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7793943285942078, + "num_tokens": 2177844537.0, + "step": 4543 + }, + { + "epoch": 2.6967359050445103, + "grad_norm": 0.5512536764144897, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7707866430282593, + "num_tokens": 2178348855.0, + "step": 4544 + }, + { + "epoch": 2.6973293768545994, + "grad_norm": 0.5764163136482239, + "learning_rate": 
1e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7722644805908203, + "num_tokens": 2178814773.0, + "step": 4545 + }, + { + "epoch": 2.6979228486646885, + "grad_norm": 0.566582202911377, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.76744544506073, + "num_tokens": 2179292161.0, + "step": 4546 + }, + { + "epoch": 2.6985163204747775, + "grad_norm": 0.5491179823875427, + "learning_rate": 1e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.7846642136573792, + "num_tokens": 2179733270.0, + "step": 4547 + }, + { + "epoch": 2.6991097922848666, + "grad_norm": 0.5424484610557556, + "learning_rate": 1e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7825576663017273, + "num_tokens": 2180232989.0, + "step": 4548 + }, + { + "epoch": 2.6997032640949556, + "grad_norm": 0.5450141429901123, + "learning_rate": 1e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7819325923919678, + "num_tokens": 2180733881.0, + "step": 4549 + }, + { + "epoch": 2.7002967359050443, + "grad_norm": 0.555711030960083, + "learning_rate": 1e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7746831178665161, + "num_tokens": 2181161095.0, + "step": 4550 + }, + { + "epoch": 2.7008902077151333, + "grad_norm": 0.5700758695602417, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7706434726715088, + "num_tokens": 2181650170.0, + "step": 4551 + }, + { + "epoch": 2.7014836795252224, + "grad_norm": 0.573082685470581, + "learning_rate": 1e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7706056833267212, + "num_tokens": 2182121930.0, + "step": 4552 + }, + { + "epoch": 2.7020771513353115, + "grad_norm": 0.5824128985404968, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7685069441795349, + "num_tokens": 2182569670.0, + "step": 4553 + }, + { + "epoch": 2.7026706231454005, + "grad_norm": 0.5490542054176331, + "learning_rate": 1e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7761044502258301, + "num_tokens": 2183047113.0, + "step": 4554 + }, + { + 
"epoch": 2.7032640949554896, + "grad_norm": 0.5594859719276428, + "learning_rate": 1e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.7838106751441956, + "num_tokens": 2183532778.0, + "step": 4555 + }, + { + "epoch": 2.7038575667655786, + "grad_norm": 0.6344140768051147, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7541978359222412, + "num_tokens": 2183945923.0, + "step": 4556 + }, + { + "epoch": 2.7044510385756677, + "grad_norm": 0.5531843900680542, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7597919702529907, + "num_tokens": 2184393875.0, + "step": 4557 + }, + { + "epoch": 2.7050445103857568, + "grad_norm": 0.5390114188194275, + "learning_rate": 1e-06, + "loss": 0.668, + "mean_token_accuracy": 0.7863459587097168, + "num_tokens": 2184863409.0, + "step": 4558 + }, + { + "epoch": 2.705637982195846, + "grad_norm": 0.5512251257896423, + "learning_rate": 1e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.7824764847755432, + "num_tokens": 2185354731.0, + "step": 4559 + }, + { + "epoch": 2.706231454005935, + "grad_norm": 0.5703768730163574, + "learning_rate": 1e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7711647748947144, + "num_tokens": 2185792386.0, + "step": 4560 + }, + { + "epoch": 2.706824925816024, + "grad_norm": 0.560881495475769, + "learning_rate": 1e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7814407348632812, + "num_tokens": 2186230718.0, + "step": 4561 + }, + { + "epoch": 2.707418397626113, + "grad_norm": 0.5205045938491821, + "learning_rate": 1e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.7910069227218628, + "num_tokens": 2186763303.0, + "step": 4562 + }, + { + "epoch": 2.7080118694362016, + "grad_norm": 0.5578598976135254, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7709678411483765, + "num_tokens": 2187253001.0, + "step": 4563 + }, + { + "epoch": 2.7086053412462907, + "grad_norm": 0.5720000863075256, + "learning_rate": 1e-06, + "loss": 0.735, + 
"mean_token_accuracy": 0.7639790773391724, + "num_tokens": 2187707432.0, + "step": 4564 + }, + { + "epoch": 2.7091988130563798, + "grad_norm": 0.5747025012969971, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7694365978240967, + "num_tokens": 2188218456.0, + "step": 4565 + }, + { + "epoch": 2.709792284866469, + "grad_norm": 0.538563072681427, + "learning_rate": 1e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7709476947784424, + "num_tokens": 2188687263.0, + "step": 4566 + }, + { + "epoch": 2.710385756676558, + "grad_norm": 0.5950585007667542, + "learning_rate": 1e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7790472507476807, + "num_tokens": 2189129949.0, + "step": 4567 + }, + { + "epoch": 2.710979228486647, + "grad_norm": 0.5461045503616333, + "learning_rate": 1e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7711362838745117, + "num_tokens": 2189609767.0, + "step": 4568 + }, + { + "epoch": 2.711572700296736, + "grad_norm": 0.5433474183082581, + "learning_rate": 1e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7802749276161194, + "num_tokens": 2190069683.0, + "step": 4569 + }, + { + "epoch": 2.712166172106825, + "grad_norm": 0.531090497970581, + "learning_rate": 1e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.759769082069397, + "num_tokens": 2190565999.0, + "step": 4570 + }, + { + "epoch": 2.7127596439169137, + "grad_norm": 0.5159995555877686, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7685077786445618, + "num_tokens": 2191079047.0, + "step": 4571 + }, + { + "epoch": 2.7133531157270028, + "grad_norm": 0.5635929107666016, + "learning_rate": 1e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7609531879425049, + "num_tokens": 2191534271.0, + "step": 4572 + }, + { + "epoch": 2.713946587537092, + "grad_norm": 0.5201776623725891, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7767132520675659, + "num_tokens": 2192054177.0, + "step": 4573 + }, + { + "epoch": 2.714540059347181, + 
"grad_norm": 0.5702383518218994, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7731059193611145, + "num_tokens": 2192514664.0, + "step": 4574 + }, + { + "epoch": 2.71513353115727, + "grad_norm": 0.5552186965942383, + "learning_rate": 1e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7680808901786804, + "num_tokens": 2193008677.0, + "step": 4575 + }, + { + "epoch": 2.715727002967359, + "grad_norm": 0.544712245464325, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7710822224617004, + "num_tokens": 2193497978.0, + "step": 4576 + }, + { + "epoch": 2.716320474777448, + "grad_norm": 0.576723039150238, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7638082504272461, + "num_tokens": 2193937392.0, + "step": 4577 + }, + { + "epoch": 2.716913946587537, + "grad_norm": 0.5580746531486511, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.773891806602478, + "num_tokens": 2194407667.0, + "step": 4578 + }, + { + "epoch": 2.717507418397626, + "grad_norm": 0.515964150428772, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7718582153320312, + "num_tokens": 2194938999.0, + "step": 4579 + }, + { + "epoch": 2.7181008902077153, + "grad_norm": 0.5495865941047668, + "learning_rate": 1e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.776821494102478, + "num_tokens": 2195425411.0, + "step": 4580 + }, + { + "epoch": 2.7186943620178043, + "grad_norm": 0.6206713914871216, + "learning_rate": 1e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7624867558479309, + "num_tokens": 2195893544.0, + "step": 4581 + }, + { + "epoch": 2.7192878338278934, + "grad_norm": 0.5478913187980652, + "learning_rate": 1e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7790707349777222, + "num_tokens": 2196370194.0, + "step": 4582 + }, + { + "epoch": 2.7198813056379825, + "grad_norm": 0.5514094233512878, + "learning_rate": 1e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7799339294433594, + 
"num_tokens": 2196837929.0, + "step": 4583 + }, + { + "epoch": 2.720474777448071, + "grad_norm": 0.5847609639167786, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7624000310897827, + "num_tokens": 2197326505.0, + "step": 4584 + }, + { + "epoch": 2.72106824925816, + "grad_norm": 0.5815389156341553, + "learning_rate": 1e-06, + "loss": 0.6968, + "mean_token_accuracy": 0.7760318517684937, + "num_tokens": 2197794570.0, + "step": 4585 + }, + { + "epoch": 2.721661721068249, + "grad_norm": 0.5240265727043152, + "learning_rate": 1e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7716158628463745, + "num_tokens": 2198305601.0, + "step": 4586 + }, + { + "epoch": 2.7222551928783383, + "grad_norm": 0.5556561946868896, + "learning_rate": 1e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7619376182556152, + "num_tokens": 2198762111.0, + "step": 4587 + }, + { + "epoch": 2.7228486646884273, + "grad_norm": 0.5344345569610596, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7708523273468018, + "num_tokens": 2199271727.0, + "step": 4588 + }, + { + "epoch": 2.7234421364985164, + "grad_norm": 0.5523151755332947, + "learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7652924656867981, + "num_tokens": 2199771005.0, + "step": 4589 + }, + { + "epoch": 2.7240356083086055, + "grad_norm": 0.5616751313209534, + "learning_rate": 1e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.7832478880882263, + "num_tokens": 2200205096.0, + "step": 4590 + }, + { + "epoch": 2.724629080118694, + "grad_norm": 0.555838942527771, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7620784640312195, + "num_tokens": 2200668378.0, + "step": 4591 + }, + { + "epoch": 2.725222551928783, + "grad_norm": 0.557631254196167, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7747693061828613, + "num_tokens": 2201162804.0, + "step": 4592 + }, + { + "epoch": 2.725816023738872, + "grad_norm": 0.5844079256057739, + 
"learning_rate": 1e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.7868977785110474, + "num_tokens": 2201662509.0, + "step": 4593 + }, + { + "epoch": 2.7264094955489613, + "grad_norm": 0.5285967588424683, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7736983299255371, + "num_tokens": 2202167180.0, + "step": 4594 + }, + { + "epoch": 2.7270029673590503, + "grad_norm": 0.5982759594917297, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7510405778884888, + "num_tokens": 2202637455.0, + "step": 4595 + }, + { + "epoch": 2.7275964391691394, + "grad_norm": 0.5486510396003723, + "learning_rate": 1e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.756118655204773, + "num_tokens": 2203130744.0, + "step": 4596 + }, + { + "epoch": 2.7281899109792285, + "grad_norm": 0.5787878036499023, + "learning_rate": 1e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7697219848632812, + "num_tokens": 2203586619.0, + "step": 4597 + }, + { + "epoch": 2.7287833827893175, + "grad_norm": 0.5397694706916809, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7743467092514038, + "num_tokens": 2204084420.0, + "step": 4598 + }, + { + "epoch": 2.7293768545994066, + "grad_norm": 0.5586326122283936, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7685304880142212, + "num_tokens": 2204588954.0, + "step": 4599 + }, + { + "epoch": 2.7299703264094957, + "grad_norm": 0.59321129322052, + "learning_rate": 1e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7796808481216431, + "num_tokens": 2205099350.0, + "step": 4600 + }, + { + "epoch": 2.7305637982195847, + "grad_norm": 0.5201941132545471, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7630807757377625, + "num_tokens": 2205634399.0, + "step": 4601 + }, + { + "epoch": 2.731157270029674, + "grad_norm": 0.5657995939254761, + "learning_rate": 1e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7600604295730591, + "num_tokens": 2206126674.0, + 
"step": 4602 + }, + { + "epoch": 2.731750741839763, + "grad_norm": 0.5794781446456909, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7727169394493103, + "num_tokens": 2206583794.0, + "step": 4603 + }, + { + "epoch": 2.7323442136498515, + "grad_norm": 0.5399395823478699, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7694507241249084, + "num_tokens": 2207094403.0, + "step": 4604 + }, + { + "epoch": 2.7329376854599405, + "grad_norm": 0.5368034839630127, + "learning_rate": 1e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7730978727340698, + "num_tokens": 2207589596.0, + "step": 4605 + }, + { + "epoch": 2.7335311572700296, + "grad_norm": 0.5708180665969849, + "learning_rate": 1e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7738929986953735, + "num_tokens": 2208065107.0, + "step": 4606 + }, + { + "epoch": 2.7341246290801187, + "grad_norm": 0.526122510433197, + "learning_rate": 1e-06, + "loss": 0.6631, + "mean_token_accuracy": 0.7864588499069214, + "num_tokens": 2208580919.0, + "step": 4607 + }, + { + "epoch": 2.7347181008902077, + "grad_norm": 0.5382761359214783, + "learning_rate": 1e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7691528797149658, + "num_tokens": 2209051106.0, + "step": 4608 + }, + { + "epoch": 2.735311572700297, + "grad_norm": 0.5817364454269409, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7728884816169739, + "num_tokens": 2209491717.0, + "step": 4609 + }, + { + "epoch": 2.735905044510386, + "grad_norm": 0.5865426063537598, + "learning_rate": 1e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7748522162437439, + "num_tokens": 2209929675.0, + "step": 4610 + }, + { + "epoch": 2.736498516320475, + "grad_norm": 0.549262285232544, + "learning_rate": 1e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7811716794967651, + "num_tokens": 2210400928.0, + "step": 4611 + }, + { + "epoch": 2.7370919881305635, + "grad_norm": 0.5422787070274353, + "learning_rate": 1e-06, + "loss": 
0.7665, + "mean_token_accuracy": 0.7593823671340942, + "num_tokens": 2210873547.0, + "step": 4612 + }, + { + "epoch": 2.7376854599406526, + "grad_norm": 0.5677167773246765, + "learning_rate": 1e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.7875158190727234, + "num_tokens": 2211356200.0, + "step": 4613 + }, + { + "epoch": 2.7382789317507417, + "grad_norm": 0.5767511129379272, + "learning_rate": 1e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7636907696723938, + "num_tokens": 2211822692.0, + "step": 4614 + }, + { + "epoch": 2.7388724035608307, + "grad_norm": 0.5442925095558167, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7702824473381042, + "num_tokens": 2212313765.0, + "step": 4615 + }, + { + "epoch": 2.73946587537092, + "grad_norm": 0.5445153713226318, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.759141206741333, + "num_tokens": 2212765262.0, + "step": 4616 + }, + { + "epoch": 2.740059347181009, + "grad_norm": 0.5763320326805115, + "learning_rate": 1e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.783610463142395, + "num_tokens": 2213264955.0, + "step": 4617 + }, + { + "epoch": 2.740652818991098, + "grad_norm": 0.5992476344108582, + "learning_rate": 1e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7649233341217041, + "num_tokens": 2213734914.0, + "step": 4618 + }, + { + "epoch": 2.741246290801187, + "grad_norm": 0.5382799506187439, + "learning_rate": 1e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7850717306137085, + "num_tokens": 2214246001.0, + "step": 4619 + }, + { + "epoch": 2.741839762611276, + "grad_norm": 0.5512813925743103, + "learning_rate": 1e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7769724130630493, + "num_tokens": 2214707241.0, + "step": 4620 + }, + { + "epoch": 2.742433234421365, + "grad_norm": 0.5580154657363892, + "learning_rate": 1e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7722105979919434, + "num_tokens": 2215202006.0, + "step": 4621 + }, + { + "epoch": 
2.743026706231454, + "grad_norm": 0.5818661451339722, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7667421698570251, + "num_tokens": 2215692712.0, + "step": 4622 + }, + { + "epoch": 2.7436201780415432, + "grad_norm": 0.5423058867454529, + "learning_rate": 1e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7722176909446716, + "num_tokens": 2216193022.0, + "step": 4623 + }, + { + "epoch": 2.7442136498516323, + "grad_norm": 0.5157225131988525, + "learning_rate": 1e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7675536870956421, + "num_tokens": 2216691542.0, + "step": 4624 + }, + { + "epoch": 2.744807121661721, + "grad_norm": 0.541292130947113, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7759156227111816, + "num_tokens": 2217168938.0, + "step": 4625 + }, + { + "epoch": 2.74540059347181, + "grad_norm": 0.5781338810920715, + "learning_rate": 1e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7587155103683472, + "num_tokens": 2217696031.0, + "step": 4626 + }, + { + "epoch": 2.745994065281899, + "grad_norm": 0.556384265422821, + "learning_rate": 1e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.7855468988418579, + "num_tokens": 2218153708.0, + "step": 4627 + }, + { + "epoch": 2.746587537091988, + "grad_norm": 0.5403104424476624, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7707209587097168, + "num_tokens": 2218645158.0, + "step": 4628 + }, + { + "epoch": 2.747181008902077, + "grad_norm": 0.5891698002815247, + "learning_rate": 1e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7754697203636169, + "num_tokens": 2219097310.0, + "step": 4629 + }, + { + "epoch": 2.7477744807121662, + "grad_norm": 0.6026095151901245, + "learning_rate": 1e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7877359986305237, + "num_tokens": 2219577302.0, + "step": 4630 + }, + { + "epoch": 2.7483679525222553, + "grad_norm": 0.5366686582565308, + "learning_rate": 1e-06, + "loss": 0.7085, + "mean_token_accuracy": 
0.7750326991081238, + "num_tokens": 2220096537.0, + "step": 4631 + }, + { + "epoch": 2.7489614243323444, + "grad_norm": 0.5921921133995056, + "learning_rate": 1e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7501886487007141, + "num_tokens": 2220525701.0, + "step": 4632 + }, + { + "epoch": 2.749554896142433, + "grad_norm": 0.5663008093833923, + "learning_rate": 1e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.7896304130554199, + "num_tokens": 2220990539.0, + "step": 4633 + }, + { + "epoch": 2.750148367952522, + "grad_norm": 0.6092220544815063, + "learning_rate": 1e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7707438468933105, + "num_tokens": 2221496386.0, + "step": 4634 + }, + { + "epoch": 2.750741839762611, + "grad_norm": 0.5613754391670227, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7739940881729126, + "num_tokens": 2221970017.0, + "step": 4635 + }, + { + "epoch": 2.7513353115727, + "grad_norm": 0.5642778873443604, + "learning_rate": 1e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7804098129272461, + "num_tokens": 2222435080.0, + "step": 4636 + }, + { + "epoch": 2.7519287833827892, + "grad_norm": 0.5822794437408447, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7639051079750061, + "num_tokens": 2222955939.0, + "step": 4637 + }, + { + "epoch": 2.7525222551928783, + "grad_norm": 0.5958338379859924, + "learning_rate": 1e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7572028636932373, + "num_tokens": 2223414620.0, + "step": 4638 + }, + { + "epoch": 2.7531157270029674, + "grad_norm": 0.5310266613960266, + "learning_rate": 1e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7734655141830444, + "num_tokens": 2223924998.0, + "step": 4639 + }, + { + "epoch": 2.7537091988130564, + "grad_norm": 0.531170666217804, + "learning_rate": 1e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.7917360067367554, + "num_tokens": 2224403845.0, + "step": 4640 + }, + { + "epoch": 2.7543026706231455, + "grad_norm": 
0.5757725834846497, + "learning_rate": 1e-06, + "loss": 0.7087, + "mean_token_accuracy": 0.7740078568458557, + "num_tokens": 2224842527.0, + "step": 4641 + }, + { + "epoch": 2.7548961424332346, + "grad_norm": 0.6665242910385132, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7685776948928833, + "num_tokens": 2225285620.0, + "step": 4642 + }, + { + "epoch": 2.7554896142433236, + "grad_norm": 0.5634599328041077, + "learning_rate": 1e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.7733614444732666, + "num_tokens": 2225804504.0, + "step": 4643 + }, + { + "epoch": 2.7560830860534127, + "grad_norm": 0.580292820930481, + "learning_rate": 1e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7662144303321838, + "num_tokens": 2226315308.0, + "step": 4644 + }, + { + "epoch": 2.7566765578635017, + "grad_norm": 0.5767566561698914, + "learning_rate": 1e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.775052011013031, + "num_tokens": 2226766427.0, + "step": 4645 + }, + { + "epoch": 2.7572700296735904, + "grad_norm": 0.5655676126480103, + "learning_rate": 1e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.768079400062561, + "num_tokens": 2227248832.0, + "step": 4646 + }, + { + "epoch": 2.7578635014836794, + "grad_norm": 0.558400571346283, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7628053426742554, + "num_tokens": 2227714599.0, + "step": 4647 + }, + { + "epoch": 2.7584569732937685, + "grad_norm": 0.560172975063324, + "learning_rate": 1e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7674989104270935, + "num_tokens": 2228205573.0, + "step": 4648 + }, + { + "epoch": 2.7590504451038576, + "grad_norm": 0.5121063590049744, + "learning_rate": 1e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7762271761894226, + "num_tokens": 2228698511.0, + "step": 4649 + }, + { + "epoch": 2.7596439169139466, + "grad_norm": 0.5783840417861938, + "learning_rate": 1e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7668135762214661, + 
"num_tokens": 2229136148.0, + "step": 4650 + }, + { + "epoch": 2.7602373887240357, + "grad_norm": 0.5915114283561707, + "learning_rate": 1e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7681060433387756, + "num_tokens": 2229607962.0, + "step": 4651 + }, + { + "epoch": 2.7608308605341247, + "grad_norm": 0.5520285964012146, + "learning_rate": 1e-06, + "loss": 0.673, + "mean_token_accuracy": 0.7830990552902222, + "num_tokens": 2230082445.0, + "step": 4652 + }, + { + "epoch": 2.761424332344214, + "grad_norm": 0.5674372315406799, + "learning_rate": 1e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7855367660522461, + "num_tokens": 2230487318.0, + "step": 4653 + }, + { + "epoch": 2.7620178041543024, + "grad_norm": 0.5304162502288818, + "learning_rate": 1e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.780677318572998, + "num_tokens": 2230999046.0, + "step": 4654 + }, + { + "epoch": 2.7626112759643915, + "grad_norm": 0.5598938465118408, + "learning_rate": 1e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7792885303497314, + "num_tokens": 2231472238.0, + "step": 4655 + }, + { + "epoch": 2.7632047477744806, + "grad_norm": 0.5441969633102417, + "learning_rate": 1e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.7822801470756531, + "num_tokens": 2231963488.0, + "step": 4656 + }, + { + "epoch": 2.7637982195845696, + "grad_norm": 0.5634807348251343, + "learning_rate": 1e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7722002863883972, + "num_tokens": 2232408529.0, + "step": 4657 + }, + { + "epoch": 2.7643916913946587, + "grad_norm": 0.57313072681427, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7683203220367432, + "num_tokens": 2232882338.0, + "step": 4658 + }, + { + "epoch": 2.7649851632047477, + "grad_norm": 0.5557007193565369, + "learning_rate": 1e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7813470363616943, + "num_tokens": 2233380210.0, + "step": 4659 + }, + { + "epoch": 2.765578635014837, + "grad_norm": 0.559677243232727, + 
"learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7727304697036743, + "num_tokens": 2233877715.0, + "step": 4660 + }, + { + "epoch": 2.766172106824926, + "grad_norm": 0.549206018447876, + "learning_rate": 1e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7794713377952576, + "num_tokens": 2234381836.0, + "step": 4661 + }, + { + "epoch": 2.766765578635015, + "grad_norm": 0.5615181922912598, + "learning_rate": 1e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7777848839759827, + "num_tokens": 2234829815.0, + "step": 4662 + }, + { + "epoch": 2.767359050445104, + "grad_norm": 0.5311378836631775, + "learning_rate": 1e-06, + "loss": 0.6968, + "mean_token_accuracy": 0.775455117225647, + "num_tokens": 2235309702.0, + "step": 4663 + }, + { + "epoch": 2.767952522255193, + "grad_norm": 0.5791686773300171, + "learning_rate": 1e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7673630118370056, + "num_tokens": 2235768612.0, + "step": 4664 + }, + { + "epoch": 2.768545994065282, + "grad_norm": 0.5527361035346985, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7658646702766418, + "num_tokens": 2236244438.0, + "step": 4665 + }, + { + "epoch": 2.769139465875371, + "grad_norm": 0.5375433564186096, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7735100388526917, + "num_tokens": 2236738524.0, + "step": 4666 + }, + { + "epoch": 2.76973293768546, + "grad_norm": 0.542554497718811, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7718364000320435, + "num_tokens": 2237227617.0, + "step": 4667 + }, + { + "epoch": 2.770326409495549, + "grad_norm": 0.5336019992828369, + "learning_rate": 1e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7770198583602905, + "num_tokens": 2237706699.0, + "step": 4668 + }, + { + "epoch": 2.770919881305638, + "grad_norm": 0.5600935220718384, + "learning_rate": 1e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7699112296104431, + "num_tokens": 2238180462.0, + "step": 4669 
+ }, + { + "epoch": 2.771513353115727, + "grad_norm": 0.5244825482368469, + "learning_rate": 1e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7797643542289734, + "num_tokens": 2238665222.0, + "step": 4670 + }, + { + "epoch": 2.772106824925816, + "grad_norm": 0.5595139265060425, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7631353735923767, + "num_tokens": 2239121713.0, + "step": 4671 + }, + { + "epoch": 2.772700296735905, + "grad_norm": 0.5853440165519714, + "learning_rate": 1e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7719425559043884, + "num_tokens": 2239545658.0, + "step": 4672 + }, + { + "epoch": 2.773293768545994, + "grad_norm": 0.5252223610877991, + "learning_rate": 1e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7672436833381653, + "num_tokens": 2240083336.0, + "step": 4673 + }, + { + "epoch": 2.7738872403560833, + "grad_norm": 0.5384024381637573, + "learning_rate": 1e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7664403319358826, + "num_tokens": 2240613282.0, + "step": 4674 + }, + { + "epoch": 2.774480712166172, + "grad_norm": 0.5536361932754517, + "learning_rate": 1e-06, + "loss": 0.676, + "mean_token_accuracy": 0.7823894619941711, + "num_tokens": 2241066728.0, + "step": 4675 + }, + { + "epoch": 2.775074183976261, + "grad_norm": 0.5594276785850525, + "learning_rate": 1e-06, + "loss": 0.705, + "mean_token_accuracy": 0.774811327457428, + "num_tokens": 2241535583.0, + "step": 4676 + }, + { + "epoch": 2.77566765578635, + "grad_norm": 0.5605979561805725, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7686885595321655, + "num_tokens": 2241983178.0, + "step": 4677 + }, + { + "epoch": 2.776261127596439, + "grad_norm": 0.5659112334251404, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7584531903266907, + "num_tokens": 2242447114.0, + "step": 4678 + }, + { + "epoch": 2.776854599406528, + "grad_norm": 0.551550030708313, + "learning_rate": 1e-06, + "loss": 0.7049, + 
"mean_token_accuracy": 0.7750506401062012, + "num_tokens": 2242906829.0, + "step": 4679 + }, + { + "epoch": 2.777448071216617, + "grad_norm": 0.5377976894378662, + "learning_rate": 1e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7621109485626221, + "num_tokens": 2243381936.0, + "step": 4680 + }, + { + "epoch": 2.7780415430267063, + "grad_norm": 0.563949465751648, + "learning_rate": 1e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.7694374918937683, + "num_tokens": 2243853741.0, + "step": 4681 + }, + { + "epoch": 2.7786350148367953, + "grad_norm": 0.5705959796905518, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.760893702507019, + "num_tokens": 2244314027.0, + "step": 4682 + }, + { + "epoch": 2.7792284866468844, + "grad_norm": 0.5326569676399231, + "learning_rate": 1e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.759487509727478, + "num_tokens": 2244798908.0, + "step": 4683 + }, + { + "epoch": 2.7798219584569734, + "grad_norm": 0.5490124225616455, + "learning_rate": 1e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7719126343727112, + "num_tokens": 2245259898.0, + "step": 4684 + }, + { + "epoch": 2.7804154302670625, + "grad_norm": 0.5580524802207947, + "learning_rate": 1e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7748628854751587, + "num_tokens": 2245702680.0, + "step": 4685 + }, + { + "epoch": 2.7810089020771516, + "grad_norm": 0.533920168876648, + "learning_rate": 1e-06, + "loss": 0.675, + "mean_token_accuracy": 0.7838315367698669, + "num_tokens": 2246144149.0, + "step": 4686 + }, + { + "epoch": 2.7816023738872406, + "grad_norm": 0.5657557845115662, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.766656756401062, + "num_tokens": 2246610081.0, + "step": 4687 + }, + { + "epoch": 2.7821958456973293, + "grad_norm": 0.580577552318573, + "learning_rate": 1e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7691575884819031, + "num_tokens": 2247064571.0, + "step": 4688 + }, + { + "epoch": 
2.7827893175074183, + "grad_norm": 0.5571039319038391, + "learning_rate": 1e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7617619037628174, + "num_tokens": 2247551606.0, + "step": 4689 + }, + { + "epoch": 2.7833827893175074, + "grad_norm": 0.5532189607620239, + "learning_rate": 1e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7747043967247009, + "num_tokens": 2248028936.0, + "step": 4690 + }, + { + "epoch": 2.7839762611275964, + "grad_norm": 0.5389842391014099, + "learning_rate": 1e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7782147526741028, + "num_tokens": 2248542115.0, + "step": 4691 + }, + { + "epoch": 2.7845697329376855, + "grad_norm": 0.5305922031402588, + "learning_rate": 1e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7697043418884277, + "num_tokens": 2249023029.0, + "step": 4692 + }, + { + "epoch": 2.7851632047477746, + "grad_norm": 0.5557615756988525, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7654197812080383, + "num_tokens": 2249527291.0, + "step": 4693 + }, + { + "epoch": 2.7857566765578636, + "grad_norm": 0.5087007284164429, + "learning_rate": 1e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7891642451286316, + "num_tokens": 2250072333.0, + "step": 4694 + }, + { + "epoch": 2.7863501483679523, + "grad_norm": 0.5568538904190063, + "learning_rate": 1e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7652848958969116, + "num_tokens": 2250550704.0, + "step": 4695 + }, + { + "epoch": 2.7869436201780413, + "grad_norm": 0.5361002683639526, + "learning_rate": 1e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7688111662864685, + "num_tokens": 2251067795.0, + "step": 4696 + }, + { + "epoch": 2.7875370919881304, + "grad_norm": 0.572649359703064, + "learning_rate": 1e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7587903738021851, + "num_tokens": 2251530864.0, + "step": 4697 + }, + { + "epoch": 2.7881305637982194, + "grad_norm": 0.5345184803009033, + "learning_rate": 1e-06, + "loss": 0.7184, + 
"mean_token_accuracy": 0.7706782221794128, + "num_tokens": 2252037628.0, + "step": 4698 + }, + { + "epoch": 2.7887240356083085, + "grad_norm": 0.5359765887260437, + "learning_rate": 1e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.755927562713623, + "num_tokens": 2252538239.0, + "step": 4699 + }, + { + "epoch": 2.7893175074183976, + "grad_norm": 0.564237117767334, + "learning_rate": 1e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7692762613296509, + "num_tokens": 2253009642.0, + "step": 4700 + }, + { + "epoch": 2.7899109792284866, + "grad_norm": 0.5384542942047119, + "learning_rate": 1e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7620165348052979, + "num_tokens": 2253509811.0, + "step": 4701 + }, + { + "epoch": 2.7905044510385757, + "grad_norm": 0.559307873249054, + "learning_rate": 1e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.7941875457763672, + "num_tokens": 2253950207.0, + "step": 4702 + }, + { + "epoch": 2.7910979228486648, + "grad_norm": 0.5917180180549622, + "learning_rate": 1e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.763054370880127, + "num_tokens": 2254411015.0, + "step": 4703 + }, + { + "epoch": 2.791691394658754, + "grad_norm": 0.5729857683181763, + "learning_rate": 1e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7729418873786926, + "num_tokens": 2254883077.0, + "step": 4704 + }, + { + "epoch": 2.792284866468843, + "grad_norm": 0.5690305829048157, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7648323774337769, + "num_tokens": 2255313571.0, + "step": 4705 + }, + { + "epoch": 2.792878338278932, + "grad_norm": 0.5478859543800354, + "learning_rate": 1e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7652835845947266, + "num_tokens": 2255790571.0, + "step": 4706 + }, + { + "epoch": 2.793471810089021, + "grad_norm": 0.5931490659713745, + "learning_rate": 1e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.7834347486495972, + "num_tokens": 2256235397.0, + "step": 4707 + }, + { + "epoch": 
2.7940652818991096, + "grad_norm": 0.5714335441589355, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7663424015045166, + "num_tokens": 2256716275.0, + "step": 4708 + }, + { + "epoch": 2.7946587537091987, + "grad_norm": 0.5296603441238403, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7597442865371704, + "num_tokens": 2257227558.0, + "step": 4709 + }, + { + "epoch": 2.7952522255192878, + "grad_norm": 0.5707013607025146, + "learning_rate": 1e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7829220294952393, + "num_tokens": 2257682897.0, + "step": 4710 + }, + { + "epoch": 2.795845697329377, + "grad_norm": 0.5607532262802124, + "learning_rate": 1e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7631469964981079, + "num_tokens": 2258188033.0, + "step": 4711 + }, + { + "epoch": 2.796439169139466, + "grad_norm": 0.5594637393951416, + "learning_rate": 1e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.779924750328064, + "num_tokens": 2258656644.0, + "step": 4712 + }, + { + "epoch": 2.797032640949555, + "grad_norm": 0.5353087186813354, + "learning_rate": 1e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7728300094604492, + "num_tokens": 2259174111.0, + "step": 4713 + }, + { + "epoch": 2.797626112759644, + "grad_norm": 0.5450237989425659, + "learning_rate": 1e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7689762711524963, + "num_tokens": 2259673430.0, + "step": 4714 + }, + { + "epoch": 2.798219584569733, + "grad_norm": 0.5979433655738831, + "learning_rate": 1e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7692953944206238, + "num_tokens": 2260162555.0, + "step": 4715 + }, + { + "epoch": 2.7988130563798217, + "grad_norm": 0.5899866819381714, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7695505619049072, + "num_tokens": 2260630667.0, + "step": 4716 + }, + { + "epoch": 2.7994065281899108, + "grad_norm": 0.5586960315704346, + "learning_rate": 1e-06, + "loss": 0.7097, + "mean_token_accuracy": 
0.771243691444397, + "num_tokens": 2261070159.0, + "step": 4717 + }, + { + "epoch": 2.8, + "grad_norm": 0.5797593593597412, + "learning_rate": 1e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7758612632751465, + "num_tokens": 2261515360.0, + "step": 4718 + }, + { + "epoch": 2.800593471810089, + "grad_norm": 0.5695315599441528, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7666670083999634, + "num_tokens": 2261987736.0, + "step": 4719 + }, + { + "epoch": 2.801186943620178, + "grad_norm": 0.550327718257904, + "learning_rate": 1e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7689990997314453, + "num_tokens": 2262424682.0, + "step": 4720 + }, + { + "epoch": 2.801780415430267, + "grad_norm": 0.5189958810806274, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7661480903625488, + "num_tokens": 2262927163.0, + "step": 4721 + }, + { + "epoch": 2.802373887240356, + "grad_norm": 0.5200252532958984, + "learning_rate": 1e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7763738632202148, + "num_tokens": 2263484275.0, + "step": 4722 + }, + { + "epoch": 2.802967359050445, + "grad_norm": 0.5326184630393982, + "learning_rate": 1e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7657769322395325, + "num_tokens": 2263992562.0, + "step": 4723 + }, + { + "epoch": 2.803560830860534, + "grad_norm": 0.5542232990264893, + "learning_rate": 1e-06, + "loss": 0.697, + "mean_token_accuracy": 0.7780044078826904, + "num_tokens": 2264452772.0, + "step": 4724 + }, + { + "epoch": 2.8041543026706233, + "grad_norm": 0.5556995272636414, + "learning_rate": 1e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7662333250045776, + "num_tokens": 2264940240.0, + "step": 4725 + }, + { + "epoch": 2.8047477744807123, + "grad_norm": 0.5391499400138855, + "learning_rate": 1e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.7818865776062012, + "num_tokens": 2265421883.0, + "step": 4726 + }, + { + "epoch": 2.8053412462908014, + "grad_norm": 0.5557413697242737, + 
"learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7671012282371521, + "num_tokens": 2265885483.0, + "step": 4727 + }, + { + "epoch": 2.8059347181008905, + "grad_norm": 0.5654311776161194, + "learning_rate": 1e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.7819440364837646, + "num_tokens": 2266330881.0, + "step": 4728 + }, + { + "epoch": 2.806528189910979, + "grad_norm": 0.5132797956466675, + "learning_rate": 1e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7835034132003784, + "num_tokens": 2266852546.0, + "step": 4729 + }, + { + "epoch": 2.807121661721068, + "grad_norm": 0.6079187989234924, + "learning_rate": 1e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7770702838897705, + "num_tokens": 2267304723.0, + "step": 4730 + }, + { + "epoch": 2.807715133531157, + "grad_norm": 0.5682596564292908, + "learning_rate": 1e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.7810002565383911, + "num_tokens": 2267789493.0, + "step": 4731 + }, + { + "epoch": 2.8083086053412463, + "grad_norm": 0.5353459715843201, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7655312418937683, + "num_tokens": 2268308742.0, + "step": 4732 + }, + { + "epoch": 2.8089020771513353, + "grad_norm": 0.5739367008209229, + "learning_rate": 1e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7580651044845581, + "num_tokens": 2268776422.0, + "step": 4733 + }, + { + "epoch": 2.8094955489614244, + "grad_norm": 0.5690391659736633, + "learning_rate": 1e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7777193784713745, + "num_tokens": 2269267288.0, + "step": 4734 + }, + { + "epoch": 2.8100890207715135, + "grad_norm": 0.5535723567008972, + "learning_rate": 1e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7747793197631836, + "num_tokens": 2269745755.0, + "step": 4735 + }, + { + "epoch": 2.8106824925816025, + "grad_norm": 0.5318773984909058, + "learning_rate": 1e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7770931720733643, + "num_tokens": 2270229190.0, + 
"step": 4736 + }, + { + "epoch": 2.811275964391691, + "grad_norm": 0.5699154734611511, + "learning_rate": 1e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7740440964698792, + "num_tokens": 2270687823.0, + "step": 4737 + }, + { + "epoch": 2.81186943620178, + "grad_norm": 0.559113085269928, + "learning_rate": 1e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7661508321762085, + "num_tokens": 2271189304.0, + "step": 4738 + }, + { + "epoch": 2.8124629080118693, + "grad_norm": 0.5260014533996582, + "learning_rate": 1e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7766270041465759, + "num_tokens": 2271683274.0, + "step": 4739 + }, + { + "epoch": 2.8130563798219583, + "grad_norm": 0.5626552700996399, + "learning_rate": 1e-06, + "loss": 0.784, + "mean_token_accuracy": 0.7514716386795044, + "num_tokens": 2272124379.0, + "step": 4740 + }, + { + "epoch": 2.8136498516320474, + "grad_norm": 0.5883008241653442, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.766998291015625, + "num_tokens": 2272588138.0, + "step": 4741 + }, + { + "epoch": 2.8142433234421365, + "grad_norm": 0.5621805787086487, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7694021463394165, + "num_tokens": 2273083767.0, + "step": 4742 + }, + { + "epoch": 2.8148367952522255, + "grad_norm": 0.537857174873352, + "learning_rate": 1e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.7894144058227539, + "num_tokens": 2273552612.0, + "step": 4743 + }, + { + "epoch": 2.8154302670623146, + "grad_norm": 0.5511640310287476, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7621155977249146, + "num_tokens": 2274027154.0, + "step": 4744 + }, + { + "epoch": 2.8160237388724036, + "grad_norm": 0.5725818276405334, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7616912126541138, + "num_tokens": 2274502215.0, + "step": 4745 + }, + { + "epoch": 2.8166172106824927, + "grad_norm": 0.5536430478096008, + "learning_rate": 1e-06, + "loss": 
0.6648, + "mean_token_accuracy": 0.7849884033203125, + "num_tokens": 2275011324.0, + "step": 4746 + }, + { + "epoch": 2.8172106824925818, + "grad_norm": 0.5359364151954651, + "learning_rate": 1e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7653536200523376, + "num_tokens": 2275509891.0, + "step": 4747 + }, + { + "epoch": 2.817804154302671, + "grad_norm": 0.5503918528556824, + "learning_rate": 1e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7692227363586426, + "num_tokens": 2276000450.0, + "step": 4748 + }, + { + "epoch": 2.81839762611276, + "grad_norm": 0.5488994717597961, + "learning_rate": 1e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7776645421981812, + "num_tokens": 2276505248.0, + "step": 4749 + }, + { + "epoch": 2.8189910979228485, + "grad_norm": 0.5614345073699951, + "learning_rate": 1e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.774263858795166, + "num_tokens": 2276951272.0, + "step": 4750 + }, + { + "epoch": 2.8195845697329376, + "grad_norm": 0.5783262252807617, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7528800964355469, + "num_tokens": 2277430729.0, + "step": 4751 + }, + { + "epoch": 2.8201780415430266, + "grad_norm": 0.5233215093612671, + "learning_rate": 1e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7819168567657471, + "num_tokens": 2277941073.0, + "step": 4752 + }, + { + "epoch": 2.8207715133531157, + "grad_norm": 0.5579380393028259, + "learning_rate": 1e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7689225077629089, + "num_tokens": 2278413329.0, + "step": 4753 + }, + { + "epoch": 2.8213649851632048, + "grad_norm": 0.5396685004234314, + "learning_rate": 1e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7714216113090515, + "num_tokens": 2278890752.0, + "step": 4754 + }, + { + "epoch": 2.821958456973294, + "grad_norm": 0.5202210545539856, + "learning_rate": 1e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7733702659606934, + "num_tokens": 2279374946.0, + "step": 4755 + }, + { + "epoch": 
2.822551928783383, + "grad_norm": 0.562462568283081, + "learning_rate": 1e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7710986137390137, + "num_tokens": 2279826812.0, + "step": 4756 + }, + { + "epoch": 2.823145400593472, + "grad_norm": 0.5569667220115662, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7775020003318787, + "num_tokens": 2280278857.0, + "step": 4757 + }, + { + "epoch": 2.8237388724035606, + "grad_norm": 0.58083176612854, + "learning_rate": 1e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7696602940559387, + "num_tokens": 2280711127.0, + "step": 4758 + }, + { + "epoch": 2.8243323442136496, + "grad_norm": 0.5500974655151367, + "learning_rate": 1e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7843562364578247, + "num_tokens": 2281206652.0, + "step": 4759 + }, + { + "epoch": 2.8249258160237387, + "grad_norm": 0.5470104217529297, + "learning_rate": 1e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7796599864959717, + "num_tokens": 2281683307.0, + "step": 4760 + }, + { + "epoch": 2.8255192878338278, + "grad_norm": 0.5828502178192139, + "learning_rate": 1e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7589479088783264, + "num_tokens": 2282132975.0, + "step": 4761 + }, + { + "epoch": 2.826112759643917, + "grad_norm": 0.5744968056678772, + "learning_rate": 1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7692397236824036, + "num_tokens": 2282618151.0, + "step": 4762 + }, + { + "epoch": 2.826706231454006, + "grad_norm": 0.5376270413398743, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7660316228866577, + "num_tokens": 2283129362.0, + "step": 4763 + }, + { + "epoch": 2.827299703264095, + "grad_norm": 0.5532797574996948, + "learning_rate": 1e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7643024921417236, + "num_tokens": 2283607199.0, + "step": 4764 + }, + { + "epoch": 2.827893175074184, + "grad_norm": 0.5638882517814636, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 
0.7601306438446045, + "num_tokens": 2284065897.0, + "step": 4765 + }, + { + "epoch": 2.828486646884273, + "grad_norm": 0.5650655031204224, + "learning_rate": 1e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.774903416633606, + "num_tokens": 2284537300.0, + "step": 4766 + }, + { + "epoch": 2.829080118694362, + "grad_norm": 0.5832340717315674, + "learning_rate": 1e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7811871767044067, + "num_tokens": 2285016457.0, + "step": 4767 + }, + { + "epoch": 2.829673590504451, + "grad_norm": 0.544060230255127, + "learning_rate": 1e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7782825827598572, + "num_tokens": 2285488825.0, + "step": 4768 + }, + { + "epoch": 2.8302670623145403, + "grad_norm": 0.5778658986091614, + "learning_rate": 1e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7645134925842285, + "num_tokens": 2285981933.0, + "step": 4769 + }, + { + "epoch": 2.8308605341246293, + "grad_norm": 0.5710673928260803, + "learning_rate": 1e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7698423266410828, + "num_tokens": 2286419668.0, + "step": 4770 + }, + { + "epoch": 2.831454005934718, + "grad_norm": 0.5488954186439514, + "learning_rate": 1e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7704037427902222, + "num_tokens": 2286902076.0, + "step": 4771 + }, + { + "epoch": 2.832047477744807, + "grad_norm": 0.5394644141197205, + "learning_rate": 1e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.772157609462738, + "num_tokens": 2287405058.0, + "step": 4772 + }, + { + "epoch": 2.832640949554896, + "grad_norm": 0.5261936187744141, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7715281844139099, + "num_tokens": 2287929604.0, + "step": 4773 + }, + { + "epoch": 2.833234421364985, + "grad_norm": 0.5182908773422241, + "learning_rate": 1e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7791023254394531, + "num_tokens": 2288402442.0, + "step": 4774 + }, + { + "epoch": 2.833827893175074, + "grad_norm": 
0.5569829344749451, + "learning_rate": 1e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7669517397880554, + "num_tokens": 2288904419.0, + "step": 4775 + }, + { + "epoch": 2.8344213649851633, + "grad_norm": 0.5636916756629944, + "learning_rate": 1e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7710160613059998, + "num_tokens": 2289362414.0, + "step": 4776 + }, + { + "epoch": 2.8350148367952523, + "grad_norm": 0.5602968335151672, + "learning_rate": 1e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7616914510726929, + "num_tokens": 2289846550.0, + "step": 4777 + }, + { + "epoch": 2.8356083086053414, + "grad_norm": 0.5459055304527283, + "learning_rate": 1e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7687907218933105, + "num_tokens": 2290390470.0, + "step": 4778 + }, + { + "epoch": 2.83620178041543, + "grad_norm": 0.5365071892738342, + "learning_rate": 1e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.784765362739563, + "num_tokens": 2290892950.0, + "step": 4779 + }, + { + "epoch": 2.836795252225519, + "grad_norm": 0.5509793162345886, + "learning_rate": 1e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7749619483947754, + "num_tokens": 2291362498.0, + "step": 4780 + }, + { + "epoch": 2.837388724035608, + "grad_norm": 0.5485178828239441, + "learning_rate": 1e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7590084671974182, + "num_tokens": 2291819604.0, + "step": 4781 + }, + { + "epoch": 2.837982195845697, + "grad_norm": 0.5740708708763123, + "learning_rate": 1e-06, + "loss": 0.6752, + "mean_token_accuracy": 0.782882809638977, + "num_tokens": 2292256846.0, + "step": 4782 + }, + { + "epoch": 2.8385756676557863, + "grad_norm": 0.5878013968467712, + "learning_rate": 1e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.771349310874939, + "num_tokens": 2292709610.0, + "step": 4783 + }, + { + "epoch": 2.8391691394658753, + "grad_norm": 0.5898747444152832, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7560070157051086, + "num_tokens": 
2293172920.0, + "step": 4784 + }, + { + "epoch": 2.8397626112759644, + "grad_norm": 0.530009388923645, + "learning_rate": 1e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7698221802711487, + "num_tokens": 2293646018.0, + "step": 4785 + }, + { + "epoch": 2.8403560830860535, + "grad_norm": 0.5269327759742737, + "learning_rate": 1e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.7852418422698975, + "num_tokens": 2294146068.0, + "step": 4786 + }, + { + "epoch": 2.8409495548961425, + "grad_norm": 0.6034558415412903, + "learning_rate": 1e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7740876078605652, + "num_tokens": 2294564600.0, + "step": 4787 + }, + { + "epoch": 2.8415430267062316, + "grad_norm": 0.600780725479126, + "learning_rate": 1e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7732365727424622, + "num_tokens": 2295036271.0, + "step": 4788 + }, + { + "epoch": 2.8421364985163207, + "grad_norm": 0.590969443321228, + "learning_rate": 1e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7799539566040039, + "num_tokens": 2295485719.0, + "step": 4789 + }, + { + "epoch": 2.8427299703264097, + "grad_norm": 0.5700244903564453, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7647859454154968, + "num_tokens": 2295974065.0, + "step": 4790 + }, + { + "epoch": 2.843323442136499, + "grad_norm": 0.5992419719696045, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7747678160667419, + "num_tokens": 2296460413.0, + "step": 4791 + }, + { + "epoch": 2.8439169139465874, + "grad_norm": 0.5509153604507446, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7682723999023438, + "num_tokens": 2296973605.0, + "step": 4792 + }, + { + "epoch": 2.8445103857566765, + "grad_norm": 0.5550484657287598, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7723973989486694, + "num_tokens": 2297463924.0, + "step": 4793 + }, + { + "epoch": 2.8451038575667655, + "grad_norm": 0.5608215928077698, + "learning_rate": 
1e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7588992118835449, + "num_tokens": 2297930011.0, + "step": 4794 + }, + { + "epoch": 2.8456973293768546, + "grad_norm": 0.5587273836135864, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.774059534072876, + "num_tokens": 2298385466.0, + "step": 4795 + }, + { + "epoch": 2.8462908011869437, + "grad_norm": 0.5700495839118958, + "learning_rate": 1e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7799458503723145, + "num_tokens": 2298817309.0, + "step": 4796 + }, + { + "epoch": 2.8468842729970327, + "grad_norm": 0.5428284406661987, + "learning_rate": 1e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7653888463973999, + "num_tokens": 2299303179.0, + "step": 4797 + }, + { + "epoch": 2.847477744807122, + "grad_norm": 0.5459060668945312, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7664958238601685, + "num_tokens": 2299768349.0, + "step": 4798 + }, + { + "epoch": 2.8480712166172104, + "grad_norm": 0.5021985173225403, + "learning_rate": 1e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7766706943511963, + "num_tokens": 2300291460.0, + "step": 4799 + }, + { + "epoch": 2.8486646884272995, + "grad_norm": 0.5273699164390564, + "learning_rate": 1e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.764885663986206, + "num_tokens": 2300796805.0, + "step": 4800 + }, + { + "epoch": 2.8492581602373885, + "grad_norm": 0.5550548434257507, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7508957386016846, + "num_tokens": 2301238279.0, + "step": 4801 + }, + { + "epoch": 2.8498516320474776, + "grad_norm": 0.5390680432319641, + "learning_rate": 1e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7690829038619995, + "num_tokens": 2301728800.0, + "step": 4802 + }, + { + "epoch": 2.8504451038575667, + "grad_norm": 0.5608527660369873, + "learning_rate": 1e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.7844698429107666, + "num_tokens": 2302182960.0, + "step": 4803 + }, + { 
+ "epoch": 2.8510385756676557, + "grad_norm": 0.559909462928772, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7706221342086792, + "num_tokens": 2302707621.0, + "step": 4804 + }, + { + "epoch": 2.851632047477745, + "grad_norm": 0.5479744076728821, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7590844631195068, + "num_tokens": 2303156221.0, + "step": 4805 + }, + { + "epoch": 2.852225519287834, + "grad_norm": 0.5552799105644226, + "learning_rate": 1e-06, + "loss": 0.735, + "mean_token_accuracy": 0.765655517578125, + "num_tokens": 2303619679.0, + "step": 4806 + }, + { + "epoch": 2.852818991097923, + "grad_norm": 0.5270249843597412, + "learning_rate": 1e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7755554914474487, + "num_tokens": 2304111825.0, + "step": 4807 + }, + { + "epoch": 2.853412462908012, + "grad_norm": 0.5465587973594666, + "learning_rate": 1e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7695193290710449, + "num_tokens": 2304577615.0, + "step": 4808 + }, + { + "epoch": 2.854005934718101, + "grad_norm": 0.5744012594223022, + "learning_rate": 1e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7571125626564026, + "num_tokens": 2305033592.0, + "step": 4809 + }, + { + "epoch": 2.85459940652819, + "grad_norm": 0.5527247786521912, + "learning_rate": 1e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7756978273391724, + "num_tokens": 2305529317.0, + "step": 4810 + }, + { + "epoch": 2.855192878338279, + "grad_norm": 0.5552204847335815, + "learning_rate": 1e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7579026818275452, + "num_tokens": 2306015432.0, + "step": 4811 + }, + { + "epoch": 2.855786350148368, + "grad_norm": 0.5469757914543152, + "learning_rate": 1e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7624093890190125, + "num_tokens": 2306488411.0, + "step": 4812 + }, + { + "epoch": 2.856379821958457, + "grad_norm": 0.5371850728988647, + "learning_rate": 1e-06, + "loss": 0.6903, + 
"mean_token_accuracy": 0.7780381441116333, + "num_tokens": 2306984029.0, + "step": 4813 + }, + { + "epoch": 2.856973293768546, + "grad_norm": 0.5088154077529907, + "learning_rate": 1e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7782595753669739, + "num_tokens": 2307522231.0, + "step": 4814 + }, + { + "epoch": 2.857566765578635, + "grad_norm": 0.5483364462852478, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7779701352119446, + "num_tokens": 2308027131.0, + "step": 4815 + }, + { + "epoch": 2.858160237388724, + "grad_norm": 0.5437062978744507, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.768471360206604, + "num_tokens": 2308508812.0, + "step": 4816 + }, + { + "epoch": 2.858753709198813, + "grad_norm": 0.5539101958274841, + "learning_rate": 1e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7717597484588623, + "num_tokens": 2308957382.0, + "step": 4817 + }, + { + "epoch": 2.859347181008902, + "grad_norm": 0.5570724606513977, + "learning_rate": 1e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7792612314224243, + "num_tokens": 2309475021.0, + "step": 4818 + }, + { + "epoch": 2.8599406528189912, + "grad_norm": 0.5737589597702026, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.771830677986145, + "num_tokens": 2309969925.0, + "step": 4819 + }, + { + "epoch": 2.86053412462908, + "grad_norm": 0.6090558171272278, + "learning_rate": 1e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.775256335735321, + "num_tokens": 2310384613.0, + "step": 4820 + }, + { + "epoch": 2.861127596439169, + "grad_norm": 0.5184503793716431, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7741760015487671, + "num_tokens": 2310892184.0, + "step": 4821 + }, + { + "epoch": 2.861721068249258, + "grad_norm": 0.5731357932090759, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.75416499376297, + "num_tokens": 2311348877.0, + "step": 4822 + }, + { + "epoch": 2.862314540059347, + 
"grad_norm": 0.5810136795043945, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7659754753112793, + "num_tokens": 2311799004.0, + "step": 4823 + }, + { + "epoch": 2.862908011869436, + "grad_norm": 0.5728212594985962, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7663046717643738, + "num_tokens": 2312263852.0, + "step": 4824 + }, + { + "epoch": 2.863501483679525, + "grad_norm": 0.5414124727249146, + "learning_rate": 1e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.7702860832214355, + "num_tokens": 2312763332.0, + "step": 4825 + }, + { + "epoch": 2.8640949554896142, + "grad_norm": 0.5306093096733093, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7741416096687317, + "num_tokens": 2313242034.0, + "step": 4826 + }, + { + "epoch": 2.8646884272997033, + "grad_norm": 0.5358209609985352, + "learning_rate": 1e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.762752115726471, + "num_tokens": 2313715073.0, + "step": 4827 + }, + { + "epoch": 2.8652818991097924, + "grad_norm": 0.5299983620643616, + "learning_rate": 1e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.7807966470718384, + "num_tokens": 2314215795.0, + "step": 4828 + }, + { + "epoch": 2.8658753709198814, + "grad_norm": 0.5473852753639221, + "learning_rate": 1e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.7846086025238037, + "num_tokens": 2314677931.0, + "step": 4829 + }, + { + "epoch": 2.8664688427299705, + "grad_norm": 0.5799381732940674, + "learning_rate": 1e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.7855854034423828, + "num_tokens": 2315136828.0, + "step": 4830 + }, + { + "epoch": 2.8670623145400596, + "grad_norm": 0.53322434425354, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7744527459144592, + "num_tokens": 2315637939.0, + "step": 4831 + }, + { + "epoch": 2.8676557863501486, + "grad_norm": 0.5737112164497375, + "learning_rate": 1e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7731009721755981, + 
"num_tokens": 2316112735.0, + "step": 4832 + }, + { + "epoch": 2.8682492581602372, + "grad_norm": 0.5595002770423889, + "learning_rate": 1e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.773669958114624, + "num_tokens": 2316593253.0, + "step": 4833 + }, + { + "epoch": 2.8688427299703263, + "grad_norm": 0.5747542977333069, + "learning_rate": 1e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7664403319358826, + "num_tokens": 2317065626.0, + "step": 4834 + }, + { + "epoch": 2.8694362017804154, + "grad_norm": 0.5320456027984619, + "learning_rate": 1e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7689541578292847, + "num_tokens": 2317559277.0, + "step": 4835 + }, + { + "epoch": 2.8700296735905044, + "grad_norm": 0.5475127696990967, + "learning_rate": 1e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7769619226455688, + "num_tokens": 2318062780.0, + "step": 4836 + }, + { + "epoch": 2.8706231454005935, + "grad_norm": 0.6107671856880188, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7734184265136719, + "num_tokens": 2318487033.0, + "step": 4837 + }, + { + "epoch": 2.8712166172106826, + "grad_norm": 0.5483636260032654, + "learning_rate": 1e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.7866731286048889, + "num_tokens": 2318954676.0, + "step": 4838 + }, + { + "epoch": 2.8718100890207716, + "grad_norm": 0.5490431785583496, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.767334520816803, + "num_tokens": 2319459376.0, + "step": 4839 + }, + { + "epoch": 2.8724035608308607, + "grad_norm": 0.5507588982582092, + "learning_rate": 1e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7763861417770386, + "num_tokens": 2319934125.0, + "step": 4840 + }, + { + "epoch": 2.8729970326409493, + "grad_norm": 0.5736011266708374, + "learning_rate": 1e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7683032155036926, + "num_tokens": 2320366838.0, + "step": 4841 + }, + { + "epoch": 2.8735905044510384, + "grad_norm": 0.5536260008811951, + 
"learning_rate": 1e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.781898021697998, + "num_tokens": 2320809208.0, + "step": 4842 + }, + { + "epoch": 2.8741839762611274, + "grad_norm": 0.5195608735084534, + "learning_rate": 1e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7787769436836243, + "num_tokens": 2321308688.0, + "step": 4843 + }, + { + "epoch": 2.8747774480712165, + "grad_norm": 0.5603107213973999, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7698992490768433, + "num_tokens": 2321775136.0, + "step": 4844 + }, + { + "epoch": 2.8753709198813056, + "grad_norm": 0.5542585253715515, + "learning_rate": 1e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7710611820220947, + "num_tokens": 2322263761.0, + "step": 4845 + }, + { + "epoch": 2.8759643916913946, + "grad_norm": 0.5788277387619019, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7616966962814331, + "num_tokens": 2322713248.0, + "step": 4846 + }, + { + "epoch": 2.8765578635014837, + "grad_norm": 0.5327665209770203, + "learning_rate": 1e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7801688313484192, + "num_tokens": 2323209786.0, + "step": 4847 + }, + { + "epoch": 2.8771513353115727, + "grad_norm": 0.5354191064834595, + "learning_rate": 1e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7832210659980774, + "num_tokens": 2323691881.0, + "step": 4848 + }, + { + "epoch": 2.877744807121662, + "grad_norm": 0.5724653005599976, + "learning_rate": 1e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7740939259529114, + "num_tokens": 2324153015.0, + "step": 4849 + }, + { + "epoch": 2.878338278931751, + "grad_norm": 0.532228410243988, + "learning_rate": 1e-06, + "loss": 0.6456, + "mean_token_accuracy": 0.7907827496528625, + "num_tokens": 2324640590.0, + "step": 4850 + }, + { + "epoch": 2.87893175074184, + "grad_norm": 0.5376854538917542, + "learning_rate": 1e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7793747186660767, + "num_tokens": 2325149302.0, + 
"step": 4851 + }, + { + "epoch": 2.879525222551929, + "grad_norm": 0.5661131143569946, + "learning_rate": 1e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7699694633483887, + "num_tokens": 2325586807.0, + "step": 4852 + }, + { + "epoch": 2.880118694362018, + "grad_norm": 0.5505233407020569, + "learning_rate": 1e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7621561288833618, + "num_tokens": 2326060706.0, + "step": 4853 + }, + { + "epoch": 2.8807121661721067, + "grad_norm": 0.5480473041534424, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7668222188949585, + "num_tokens": 2326581610.0, + "step": 4854 + }, + { + "epoch": 2.8813056379821957, + "grad_norm": 0.5357130765914917, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7612797021865845, + "num_tokens": 2327075683.0, + "step": 4855 + }, + { + "epoch": 2.881899109792285, + "grad_norm": 0.5958284735679626, + "learning_rate": 1e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7806389927864075, + "num_tokens": 2327492708.0, + "step": 4856 + }, + { + "epoch": 2.882492581602374, + "grad_norm": 0.53373783826828, + "learning_rate": 1e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7802079916000366, + "num_tokens": 2327967283.0, + "step": 4857 + }, + { + "epoch": 2.883086053412463, + "grad_norm": 0.5404695868492126, + "learning_rate": 1e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.78159499168396, + "num_tokens": 2328450541.0, + "step": 4858 + }, + { + "epoch": 2.883679525222552, + "grad_norm": 0.5280027389526367, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7679376602172852, + "num_tokens": 2328928983.0, + "step": 4859 + }, + { + "epoch": 2.884272997032641, + "grad_norm": 0.5294961333274841, + "learning_rate": 1e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.769225537776947, + "num_tokens": 2329438131.0, + "step": 4860 + }, + { + "epoch": 2.88486646884273, + "grad_norm": 0.5361743569374084, + "learning_rate": 1e-06, + "loss": 0.7324, + 
"mean_token_accuracy": 0.7679839134216309, + "num_tokens": 2329932036.0, + "step": 4861 + }, + { + "epoch": 2.8854599406528187, + "grad_norm": 0.5873178839683533, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7527902126312256, + "num_tokens": 2330400674.0, + "step": 4862 + }, + { + "epoch": 2.886053412462908, + "grad_norm": 0.5535714626312256, + "learning_rate": 1e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7791175842285156, + "num_tokens": 2330874707.0, + "step": 4863 + }, + { + "epoch": 2.886646884272997, + "grad_norm": 0.5183687806129456, + "learning_rate": 1e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7715972065925598, + "num_tokens": 2331378806.0, + "step": 4864 + }, + { + "epoch": 2.887240356083086, + "grad_norm": 0.5390891432762146, + "learning_rate": 1e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7753611207008362, + "num_tokens": 2331865285.0, + "step": 4865 + }, + { + "epoch": 2.887833827893175, + "grad_norm": 0.5505747199058533, + "learning_rate": 1e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7821974158287048, + "num_tokens": 2332358331.0, + "step": 4866 + }, + { + "epoch": 2.888427299703264, + "grad_norm": 0.5361441969871521, + "learning_rate": 1e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7754369378089905, + "num_tokens": 2332853298.0, + "step": 4867 + }, + { + "epoch": 2.889020771513353, + "grad_norm": 0.5497313737869263, + "learning_rate": 1e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7774227857589722, + "num_tokens": 2333339511.0, + "step": 4868 + }, + { + "epoch": 2.889614243323442, + "grad_norm": 0.5790055990219116, + "learning_rate": 1e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7723234295845032, + "num_tokens": 2333779231.0, + "step": 4869 + }, + { + "epoch": 2.8902077151335313, + "grad_norm": 0.5510768294334412, + "learning_rate": 1e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7729600071907043, + "num_tokens": 2334237169.0, + "step": 4870 + }, + { + "epoch": 2.8908011869436203, 
+ "grad_norm": 0.5293715000152588, + "learning_rate": 1e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7907008528709412, + "num_tokens": 2334743265.0, + "step": 4871 + }, + { + "epoch": 2.8913946587537094, + "grad_norm": 0.5435190200805664, + "learning_rate": 1e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7782818675041199, + "num_tokens": 2335220248.0, + "step": 4872 + }, + { + "epoch": 2.8919881305637984, + "grad_norm": 0.5133446455001831, + "learning_rate": 1e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.7885414361953735, + "num_tokens": 2335721281.0, + "step": 4873 + }, + { + "epoch": 2.8925816023738875, + "grad_norm": 0.5671408176422119, + "learning_rate": 1e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7700422406196594, + "num_tokens": 2336225368.0, + "step": 4874 + }, + { + "epoch": 2.893175074183976, + "grad_norm": 0.5799161791801453, + "learning_rate": 1e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7790365815162659, + "num_tokens": 2336701123.0, + "step": 4875 + }, + { + "epoch": 2.893768545994065, + "grad_norm": 0.5756129622459412, + "learning_rate": 1e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.776832103729248, + "num_tokens": 2337145624.0, + "step": 4876 + }, + { + "epoch": 2.8943620178041543, + "grad_norm": 0.5362222790718079, + "learning_rate": 1e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7688947319984436, + "num_tokens": 2337634151.0, + "step": 4877 + }, + { + "epoch": 2.8949554896142433, + "grad_norm": 0.5911990404129028, + "learning_rate": 1e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7710949778556824, + "num_tokens": 2338055661.0, + "step": 4878 + }, + { + "epoch": 2.8955489614243324, + "grad_norm": 0.58416748046875, + "learning_rate": 1e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7809543609619141, + "num_tokens": 2338519354.0, + "step": 4879 + }, + { + "epoch": 2.8961424332344214, + "grad_norm": 0.5520379543304443, + "learning_rate": 1e-06, + "loss": 0.6768, + "mean_token_accuracy": 0.7834974527359009, + 
"num_tokens": 2339061621.0, + "step": 4880 + }, + { + "epoch": 2.8967359050445105, + "grad_norm": 0.5654736161231995, + "learning_rate": 1e-06, + "loss": 0.6821, + "mean_token_accuracy": 0.7804335355758667, + "num_tokens": 2339519915.0, + "step": 4881 + }, + { + "epoch": 2.8973293768545996, + "grad_norm": 0.5845970511436462, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7600795030593872, + "num_tokens": 2339975684.0, + "step": 4882 + }, + { + "epoch": 2.897922848664688, + "grad_norm": 0.5862986445426941, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7565892934799194, + "num_tokens": 2340481557.0, + "step": 4883 + }, + { + "epoch": 2.8985163204747773, + "grad_norm": 0.5805805921554565, + "learning_rate": 1e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7683959007263184, + "num_tokens": 2340947369.0, + "step": 4884 + }, + { + "epoch": 2.8991097922848663, + "grad_norm": 0.5827736258506775, + "learning_rate": 1e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7629773616790771, + "num_tokens": 2341391111.0, + "step": 4885 + }, + { + "epoch": 2.8997032640949554, + "grad_norm": 0.5566491484642029, + "learning_rate": 1e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7778348922729492, + "num_tokens": 2341844682.0, + "step": 4886 + }, + { + "epoch": 2.9002967359050444, + "grad_norm": 0.538300096988678, + "learning_rate": 1e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7799404263496399, + "num_tokens": 2342356589.0, + "step": 4887 + }, + { + "epoch": 2.9008902077151335, + "grad_norm": 0.5583978295326233, + "learning_rate": 1e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7772443294525146, + "num_tokens": 2342872972.0, + "step": 4888 + }, + { + "epoch": 2.9014836795252226, + "grad_norm": 0.5589712858200073, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.763276219367981, + "num_tokens": 2343322867.0, + "step": 4889 + }, + { + "epoch": 2.9020771513353116, + "grad_norm": 0.5426304936408997, + 
"learning_rate": 1e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7672028541564941, + "num_tokens": 2343801979.0, + "step": 4890 + }, + { + "epoch": 2.9026706231454007, + "grad_norm": 0.5058507323265076, + "learning_rate": 1e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7748146057128906, + "num_tokens": 2344333409.0, + "step": 4891 + }, + { + "epoch": 2.9032640949554898, + "grad_norm": 0.5272338390350342, + "learning_rate": 1e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.777298092842102, + "num_tokens": 2344832934.0, + "step": 4892 + }, + { + "epoch": 2.903857566765579, + "grad_norm": 0.5726608633995056, + "learning_rate": 1e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.774952232837677, + "num_tokens": 2345311405.0, + "step": 4893 + }, + { + "epoch": 2.904451038575668, + "grad_norm": 0.5495758652687073, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7719166278839111, + "num_tokens": 2345775772.0, + "step": 4894 + }, + { + "epoch": 2.905044510385757, + "grad_norm": 0.5169110894203186, + "learning_rate": 1e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7765032052993774, + "num_tokens": 2346273195.0, + "step": 4895 + }, + { + "epoch": 2.9056379821958456, + "grad_norm": 0.5480615496635437, + "learning_rate": 1e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7737534046173096, + "num_tokens": 2346713878.0, + "step": 4896 + }, + { + "epoch": 2.9062314540059346, + "grad_norm": 0.5875840783119202, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7664005756378174, + "num_tokens": 2347160302.0, + "step": 4897 + }, + { + "epoch": 2.9068249258160237, + "grad_norm": 0.5709108114242554, + "learning_rate": 1e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7750546336174011, + "num_tokens": 2347602502.0, + "step": 4898 + }, + { + "epoch": 2.9074183976261128, + "grad_norm": 0.5349502563476562, + "learning_rate": 1e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7690311074256897, + "num_tokens": 2348124218.0, + 
"step": 4899 + }, + { + "epoch": 2.908011869436202, + "grad_norm": 0.5413877367973328, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7612307071685791, + "num_tokens": 2348588902.0, + "step": 4900 + }, + { + "epoch": 2.908605341246291, + "grad_norm": 0.5231525897979736, + "learning_rate": 1e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7770471572875977, + "num_tokens": 2349129236.0, + "step": 4901 + }, + { + "epoch": 2.90919881305638, + "grad_norm": 0.5545505285263062, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7650912404060364, + "num_tokens": 2349586909.0, + "step": 4902 + }, + { + "epoch": 2.9097922848664686, + "grad_norm": 0.5184145569801331, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7740234136581421, + "num_tokens": 2350093411.0, + "step": 4903 + }, + { + "epoch": 2.9103857566765576, + "grad_norm": 0.5203858613967896, + "learning_rate": 1e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.771818995475769, + "num_tokens": 2350577315.0, + "step": 4904 + }, + { + "epoch": 2.9109792284866467, + "grad_norm": 0.5135760307312012, + "learning_rate": 1e-06, + "loss": 0.6469, + "mean_token_accuracy": 0.7893781661987305, + "num_tokens": 2351091113.0, + "step": 4905 + }, + { + "epoch": 2.9115727002967358, + "grad_norm": 0.5412976741790771, + "learning_rate": 1e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.7871170043945312, + "num_tokens": 2351582612.0, + "step": 4906 + }, + { + "epoch": 2.912166172106825, + "grad_norm": 0.5364720821380615, + "learning_rate": 1e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7743626236915588, + "num_tokens": 2352104303.0, + "step": 4907 + }, + { + "epoch": 2.912759643916914, + "grad_norm": 0.5426978468894958, + "learning_rate": 1e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7712302207946777, + "num_tokens": 2352592556.0, + "step": 4908 + }, + { + "epoch": 2.913353115727003, + "grad_norm": 0.5511394739151001, + "learning_rate": 1e-06, + "loss": 
0.6837, + "mean_token_accuracy": 0.779756486415863, + "num_tokens": 2353048964.0, + "step": 4909 + }, + { + "epoch": 2.913946587537092, + "grad_norm": 0.5321063995361328, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7635598182678223, + "num_tokens": 2353564179.0, + "step": 4910 + }, + { + "epoch": 2.914540059347181, + "grad_norm": 0.5582547783851624, + "learning_rate": 1e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7750300765037537, + "num_tokens": 2354047310.0, + "step": 4911 + }, + { + "epoch": 2.91513353115727, + "grad_norm": 0.5470387935638428, + "learning_rate": 1e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7519435286521912, + "num_tokens": 2354524886.0, + "step": 4912 + }, + { + "epoch": 2.915727002967359, + "grad_norm": 0.5614876747131348, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7763526439666748, + "num_tokens": 2355014942.0, + "step": 4913 + }, + { + "epoch": 2.9163204747774483, + "grad_norm": 0.5455424189567566, + "learning_rate": 1e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7719857692718506, + "num_tokens": 2355496476.0, + "step": 4914 + }, + { + "epoch": 2.9169139465875373, + "grad_norm": 0.5936404466629028, + "learning_rate": 1e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7664890289306641, + "num_tokens": 2355949225.0, + "step": 4915 + }, + { + "epoch": 2.917507418397626, + "grad_norm": 0.5371657013893127, + "learning_rate": 1e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7718255519866943, + "num_tokens": 2356424415.0, + "step": 4916 + }, + { + "epoch": 2.918100890207715, + "grad_norm": 0.5676390528678894, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7615419626235962, + "num_tokens": 2356879118.0, + "step": 4917 + }, + { + "epoch": 2.918694362017804, + "grad_norm": 0.5682357549667358, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7609484195709229, + "num_tokens": 2357340942.0, + "step": 4918 + }, + { + "epoch": 
2.919287833827893, + "grad_norm": 0.5766758322715759, + "learning_rate": 1e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7786633968353271, + "num_tokens": 2357792759.0, + "step": 4919 + }, + { + "epoch": 2.919881305637982, + "grad_norm": 0.5508388876914978, + "learning_rate": 1e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7709887027740479, + "num_tokens": 2358315386.0, + "step": 4920 + }, + { + "epoch": 2.9204747774480713, + "grad_norm": 0.5640551447868347, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7655587196350098, + "num_tokens": 2358793879.0, + "step": 4921 + }, + { + "epoch": 2.9210682492581603, + "grad_norm": 0.5510725378990173, + "learning_rate": 1e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7787652015686035, + "num_tokens": 2359271470.0, + "step": 4922 + }, + { + "epoch": 2.9216617210682494, + "grad_norm": 0.5494099855422974, + "learning_rate": 1e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.768274188041687, + "num_tokens": 2359757705.0, + "step": 4923 + }, + { + "epoch": 2.922255192878338, + "grad_norm": 0.5561854839324951, + "learning_rate": 1e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.763363242149353, + "num_tokens": 2360240998.0, + "step": 4924 + }, + { + "epoch": 2.922848664688427, + "grad_norm": 0.5527947545051575, + "learning_rate": 1e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.773683488368988, + "num_tokens": 2360687466.0, + "step": 4925 + }, + { + "epoch": 2.923442136498516, + "grad_norm": 0.5466816425323486, + "learning_rate": 1e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7801269888877869, + "num_tokens": 2361161432.0, + "step": 4926 + }, + { + "epoch": 2.924035608308605, + "grad_norm": 0.5431571006774902, + "learning_rate": 1e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7537744045257568, + "num_tokens": 2361642859.0, + "step": 4927 + }, + { + "epoch": 2.9246290801186943, + "grad_norm": 0.5754122138023376, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 
0.7589608430862427, + "num_tokens": 2362086699.0, + "step": 4928 + }, + { + "epoch": 2.9252225519287833, + "grad_norm": 0.5712157487869263, + "learning_rate": 1e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7709622383117676, + "num_tokens": 2362557476.0, + "step": 4929 + }, + { + "epoch": 2.9258160237388724, + "grad_norm": 0.5478562116622925, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7650790214538574, + "num_tokens": 2363048406.0, + "step": 4930 + }, + { + "epoch": 2.9264094955489615, + "grad_norm": 0.5481058359146118, + "learning_rate": 1e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7732833623886108, + "num_tokens": 2363507287.0, + "step": 4931 + }, + { + "epoch": 2.9270029673590505, + "grad_norm": 0.5181884169578552, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7727581262588501, + "num_tokens": 2364013738.0, + "step": 4932 + }, + { + "epoch": 2.9275964391691396, + "grad_norm": 0.5223705172538757, + "learning_rate": 1e-06, + "loss": 0.666, + "mean_token_accuracy": 0.7834239602088928, + "num_tokens": 2364502818.0, + "step": 4933 + }, + { + "epoch": 2.9281899109792286, + "grad_norm": 0.5629201531410217, + "learning_rate": 1e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7741093635559082, + "num_tokens": 2364969857.0, + "step": 4934 + }, + { + "epoch": 2.9287833827893177, + "grad_norm": 0.5192505121231079, + "learning_rate": 1e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7769861221313477, + "num_tokens": 2365492219.0, + "step": 4935 + }, + { + "epoch": 2.9293768545994068, + "grad_norm": 0.5527504086494446, + "learning_rate": 1e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7588083744049072, + "num_tokens": 2365986415.0, + "step": 4936 + }, + { + "epoch": 2.9299703264094954, + "grad_norm": 0.5573838353157043, + "learning_rate": 1e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.7829415202140808, + "num_tokens": 2366417139.0, + "step": 4937 + }, + { + "epoch": 2.9305637982195845, + "grad_norm": 
0.5544165372848511, + "learning_rate": 1e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7771709561347961, + "num_tokens": 2366896945.0, + "step": 4938 + }, + { + "epoch": 2.9311572700296735, + "grad_norm": 0.5624531507492065, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7662383317947388, + "num_tokens": 2367360607.0, + "step": 4939 + }, + { + "epoch": 2.9317507418397626, + "grad_norm": 0.5333840847015381, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7733646631240845, + "num_tokens": 2367877851.0, + "step": 4940 + }, + { + "epoch": 2.9323442136498516, + "grad_norm": 0.5441178679466248, + "learning_rate": 1e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7520306706428528, + "num_tokens": 2368375164.0, + "step": 4941 + }, + { + "epoch": 2.9329376854599407, + "grad_norm": 0.5451958179473877, + "learning_rate": 1e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7679140567779541, + "num_tokens": 2368849434.0, + "step": 4942 + }, + { + "epoch": 2.9335311572700298, + "grad_norm": 0.5563727021217346, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7679755091667175, + "num_tokens": 2369349830.0, + "step": 4943 + }, + { + "epoch": 2.934124629080119, + "grad_norm": 0.5425070524215698, + "learning_rate": 1e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7686231732368469, + "num_tokens": 2369837166.0, + "step": 4944 + }, + { + "epoch": 2.9347181008902075, + "grad_norm": 0.5595461130142212, + "learning_rate": 1e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.774844765663147, + "num_tokens": 2370306065.0, + "step": 4945 + }, + { + "epoch": 2.9353115727002965, + "grad_norm": 0.5277206897735596, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7592945098876953, + "num_tokens": 2370770039.0, + "step": 4946 + }, + { + "epoch": 2.9359050445103856, + "grad_norm": 0.5373674035072327, + "learning_rate": 1e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7717893123626709, + 
"num_tokens": 2371242305.0, + "step": 4947 + }, + { + "epoch": 2.9364985163204746, + "grad_norm": 0.5615442395210266, + "learning_rate": 1e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7745197415351868, + "num_tokens": 2371696209.0, + "step": 4948 + }, + { + "epoch": 2.9370919881305637, + "grad_norm": 0.5662678480148315, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7544074058532715, + "num_tokens": 2372184359.0, + "step": 4949 + }, + { + "epoch": 2.9376854599406528, + "grad_norm": 0.5584297776222229, + "learning_rate": 1e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7739205956459045, + "num_tokens": 2372684184.0, + "step": 4950 + }, + { + "epoch": 2.938278931750742, + "grad_norm": 0.5652257204055786, + "learning_rate": 1e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7492403984069824, + "num_tokens": 2373148719.0, + "step": 4951 + }, + { + "epoch": 2.938872403560831, + "grad_norm": 0.5740075707435608, + "learning_rate": 1e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7655190229415894, + "num_tokens": 2373613411.0, + "step": 4952 + }, + { + "epoch": 2.93946587537092, + "grad_norm": 0.5644084215164185, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7672505974769592, + "num_tokens": 2374104148.0, + "step": 4953 + }, + { + "epoch": 2.940059347181009, + "grad_norm": 0.544344961643219, + "learning_rate": 1e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7795555591583252, + "num_tokens": 2374595454.0, + "step": 4954 + }, + { + "epoch": 2.940652818991098, + "grad_norm": 0.5356436967849731, + "learning_rate": 1e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.788249671459198, + "num_tokens": 2375093003.0, + "step": 4955 + }, + { + "epoch": 2.941246290801187, + "grad_norm": 0.569143533706665, + "learning_rate": 1e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7819331884384155, + "num_tokens": 2375582388.0, + "step": 4956 + }, + { + "epoch": 2.941839762611276, + "grad_norm": 0.5649272799491882, + 
"learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7651841640472412, + "num_tokens": 2376060130.0, + "step": 4957 + }, + { + "epoch": 2.942433234421365, + "grad_norm": 0.5436400771141052, + "learning_rate": 1e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7646644115447998, + "num_tokens": 2376548123.0, + "step": 4958 + }, + { + "epoch": 2.943026706231454, + "grad_norm": 0.5328571200370789, + "learning_rate": 1e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.7873491048812866, + "num_tokens": 2377013898.0, + "step": 4959 + }, + { + "epoch": 2.943620178041543, + "grad_norm": 0.5484631657600403, + "learning_rate": 1e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.776761531829834, + "num_tokens": 2377534101.0, + "step": 4960 + }, + { + "epoch": 2.944213649851632, + "grad_norm": 0.5312084555625916, + "learning_rate": 1e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7658137083053589, + "num_tokens": 2378018599.0, + "step": 4961 + }, + { + "epoch": 2.944807121661721, + "grad_norm": 0.5811449289321899, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.766340970993042, + "num_tokens": 2378486554.0, + "step": 4962 + }, + { + "epoch": 2.94540059347181, + "grad_norm": 0.5502140522003174, + "learning_rate": 1e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7642415165901184, + "num_tokens": 2378962374.0, + "step": 4963 + }, + { + "epoch": 2.945994065281899, + "grad_norm": 0.6273704767227173, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7613207101821899, + "num_tokens": 2379389055.0, + "step": 4964 + }, + { + "epoch": 2.9465875370919883, + "grad_norm": 0.5644503831863403, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7632776498794556, + "num_tokens": 2379878099.0, + "step": 4965 + }, + { + "epoch": 2.947181008902077, + "grad_norm": 0.580134391784668, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7611207962036133, + "num_tokens": 2380336194.0, + "step": 4966 
+ }, + { + "epoch": 2.947774480712166, + "grad_norm": 0.5427272319793701, + "learning_rate": 1e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7826765775680542, + "num_tokens": 2380821509.0, + "step": 4967 + }, + { + "epoch": 2.948367952522255, + "grad_norm": 0.5730485916137695, + "learning_rate": 1e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7661361694335938, + "num_tokens": 2381274687.0, + "step": 4968 + }, + { + "epoch": 2.948961424332344, + "grad_norm": 0.5734316110610962, + "learning_rate": 1e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7772420644760132, + "num_tokens": 2381730579.0, + "step": 4969 + }, + { + "epoch": 2.949554896142433, + "grad_norm": 0.5841848254203796, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7726914882659912, + "num_tokens": 2382201441.0, + "step": 4970 + }, + { + "epoch": 2.950148367952522, + "grad_norm": 0.5596738457679749, + "learning_rate": 1e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7703782320022583, + "num_tokens": 2382664994.0, + "step": 4971 + }, + { + "epoch": 2.9507418397626113, + "grad_norm": 0.5162429213523865, + "learning_rate": 1e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7726694345474243, + "num_tokens": 2383179398.0, + "step": 4972 + }, + { + "epoch": 2.9513353115727003, + "grad_norm": 0.5441253185272217, + "learning_rate": 1e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.7844374179840088, + "num_tokens": 2383653538.0, + "step": 4973 + }, + { + "epoch": 2.9519287833827894, + "grad_norm": 0.6178358793258667, + "learning_rate": 1e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7703639268875122, + "num_tokens": 2384139971.0, + "step": 4974 + }, + { + "epoch": 2.9525222551928785, + "grad_norm": 0.5329676866531372, + "learning_rate": 1e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7800217866897583, + "num_tokens": 2384667520.0, + "step": 4975 + }, + { + "epoch": 2.9531157270029675, + "grad_norm": 0.5405617952346802, + "learning_rate": 1e-06, + "loss": 0.6954, + 
"mean_token_accuracy": 0.7783118486404419, + "num_tokens": 2385148936.0, + "step": 4976 + }, + { + "epoch": 2.9537091988130566, + "grad_norm": 0.5532722473144531, + "learning_rate": 1e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7761597633361816, + "num_tokens": 2385637663.0, + "step": 4977 + }, + { + "epoch": 2.9543026706231457, + "grad_norm": 0.566578209400177, + "learning_rate": 1e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7747505307197571, + "num_tokens": 2386110037.0, + "step": 4978 + }, + { + "epoch": 2.9548961424332343, + "grad_norm": 0.6159416437149048, + "learning_rate": 1e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7664897441864014, + "num_tokens": 2386567235.0, + "step": 4979 + }, + { + "epoch": 2.9554896142433233, + "grad_norm": 0.5624906420707703, + "learning_rate": 1e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7664891481399536, + "num_tokens": 2387019398.0, + "step": 4980 + }, + { + "epoch": 2.9560830860534124, + "grad_norm": 0.5441845655441284, + "learning_rate": 1e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7779887914657593, + "num_tokens": 2387514539.0, + "step": 4981 + }, + { + "epoch": 2.9566765578635015, + "grad_norm": 0.5708242654800415, + "learning_rate": 1e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7779679894447327, + "num_tokens": 2388024571.0, + "step": 4982 + }, + { + "epoch": 2.9572700296735905, + "grad_norm": 0.5780516266822815, + "learning_rate": 1e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7687100172042847, + "num_tokens": 2388463952.0, + "step": 4983 + }, + { + "epoch": 2.9578635014836796, + "grad_norm": 0.5561296343803406, + "learning_rate": 1e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7758729457855225, + "num_tokens": 2388898043.0, + "step": 4984 + }, + { + "epoch": 2.9584569732937687, + "grad_norm": 0.5606076717376709, + "learning_rate": 1e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7779069542884827, + "num_tokens": 2389391067.0, + "step": 4985 + }, + { + "epoch": 
2.9590504451038577, + "grad_norm": 0.5885737538337708, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.758960485458374, + "num_tokens": 2389816623.0, + "step": 4986 + }, + { + "epoch": 2.9596439169139463, + "grad_norm": 0.5325468182563782, + "learning_rate": 1e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.779525101184845, + "num_tokens": 2390302485.0, + "step": 4987 + }, + { + "epoch": 2.9602373887240354, + "grad_norm": 0.5500047206878662, + "learning_rate": 1e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7722299098968506, + "num_tokens": 2390793844.0, + "step": 4988 + }, + { + "epoch": 2.9608308605341245, + "grad_norm": 0.5497153997421265, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7653691172599792, + "num_tokens": 2391274486.0, + "step": 4989 + }, + { + "epoch": 2.9614243323442135, + "grad_norm": 0.5612733960151672, + "learning_rate": 1e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7768434286117554, + "num_tokens": 2391716372.0, + "step": 4990 + }, + { + "epoch": 2.9620178041543026, + "grad_norm": 0.5653722882270813, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7642978429794312, + "num_tokens": 2392194986.0, + "step": 4991 + }, + { + "epoch": 2.9626112759643917, + "grad_norm": 0.5454906225204468, + "learning_rate": 1e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7773983478546143, + "num_tokens": 2392718595.0, + "step": 4992 + }, + { + "epoch": 2.9632047477744807, + "grad_norm": 0.5519161820411682, + "learning_rate": 1e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7720261812210083, + "num_tokens": 2393194504.0, + "step": 4993 + }, + { + "epoch": 2.96379821958457, + "grad_norm": 0.5879726409912109, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7508934736251831, + "num_tokens": 2393645111.0, + "step": 4994 + }, + { + "epoch": 2.964391691394659, + "grad_norm": 0.5362275838851929, + "learning_rate": 1e-06, + "loss": 0.7385, + "mean_token_accuracy": 
0.7668274641036987, + "num_tokens": 2394152417.0, + "step": 4995 + }, + { + "epoch": 2.964985163204748, + "grad_norm": 0.5697945356369019, + "learning_rate": 1e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7666400074958801, + "num_tokens": 2394591544.0, + "step": 4996 + }, + { + "epoch": 2.965578635014837, + "grad_norm": 0.5801454186439514, + "learning_rate": 1e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7763851284980774, + "num_tokens": 2395067752.0, + "step": 4997 + }, + { + "epoch": 2.966172106824926, + "grad_norm": 0.5511258840560913, + "learning_rate": 1e-06, + "loss": 0.6716, + "mean_token_accuracy": 0.7843531370162964, + "num_tokens": 2395514799.0, + "step": 4998 + }, + { + "epoch": 2.966765578635015, + "grad_norm": 0.5478659272193909, + "learning_rate": 1e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7698720097541809, + "num_tokens": 2396003890.0, + "step": 4999 + }, + { + "epoch": 2.9673590504451037, + "grad_norm": 0.5336816310882568, + "learning_rate": 1e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7819952964782715, + "num_tokens": 2396488631.0, + "step": 5000 + }, + { + "epoch": 2.967952522255193, + "grad_norm": 0.5398428440093994, + "learning_rate": 1e-06, + "loss": 0.6607, + "mean_token_accuracy": 0.7868779897689819, + "num_tokens": 2396981773.0, + "step": 5001 + }, + { + "epoch": 2.968545994065282, + "grad_norm": 0.5353348255157471, + "learning_rate": 1e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7734880447387695, + "num_tokens": 2397460252.0, + "step": 5002 + }, + { + "epoch": 2.969139465875371, + "grad_norm": 0.534964382648468, + "learning_rate": 1e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.783698320388794, + "num_tokens": 2397957323.0, + "step": 5003 + }, + { + "epoch": 2.96973293768546, + "grad_norm": 0.539745569229126, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.767538845539093, + "num_tokens": 2398450596.0, + "step": 5004 + }, + { + "epoch": 2.970326409495549, + "grad_norm": 
0.5312054753303528, + "learning_rate": 1e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7757617235183716, + "num_tokens": 2398929351.0, + "step": 5005 + }, + { + "epoch": 2.970919881305638, + "grad_norm": 0.5948500037193298, + "learning_rate": 1e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7601879239082336, + "num_tokens": 2399355721.0, + "step": 5006 + }, + { + "epoch": 2.9715133531157267, + "grad_norm": 0.5446754097938538, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7667378187179565, + "num_tokens": 2399806370.0, + "step": 5007 + }, + { + "epoch": 2.972106824925816, + "grad_norm": 0.5683843493461609, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7642093300819397, + "num_tokens": 2400278699.0, + "step": 5008 + }, + { + "epoch": 2.972700296735905, + "grad_norm": 0.5468249320983887, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.762893795967102, + "num_tokens": 2400768388.0, + "step": 5009 + }, + { + "epoch": 2.973293768545994, + "grad_norm": 0.5738182067871094, + "learning_rate": 1e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7744454145431519, + "num_tokens": 2401240461.0, + "step": 5010 + }, + { + "epoch": 2.973887240356083, + "grad_norm": 0.5765836238861084, + "learning_rate": 1e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7615554928779602, + "num_tokens": 2401699546.0, + "step": 5011 + }, + { + "epoch": 2.974480712166172, + "grad_norm": 0.5775726437568665, + "learning_rate": 1e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7741866111755371, + "num_tokens": 2402139165.0, + "step": 5012 + }, + { + "epoch": 2.975074183976261, + "grad_norm": 0.5367751717567444, + "learning_rate": 1e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7757847905158997, + "num_tokens": 2402630798.0, + "step": 5013 + }, + { + "epoch": 2.97566765578635, + "grad_norm": 0.5306410193443298, + "learning_rate": 1e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7762551307678223, + "num_tokens": 
2403140910.0, + "step": 5014 + }, + { + "epoch": 2.9762611275964392, + "grad_norm": 0.5483768582344055, + "learning_rate": 1e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7728720903396606, + "num_tokens": 2403625961.0, + "step": 5015 + }, + { + "epoch": 2.9768545994065283, + "grad_norm": 0.5538246035575867, + "learning_rate": 1e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.787021279335022, + "num_tokens": 2404066965.0, + "step": 5016 + }, + { + "epoch": 2.9774480712166174, + "grad_norm": 0.5574246644973755, + "learning_rate": 1e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.771188497543335, + "num_tokens": 2404541917.0, + "step": 5017 + }, + { + "epoch": 2.9780415430267064, + "grad_norm": 0.5396642684936523, + "learning_rate": 1e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7793450355529785, + "num_tokens": 2405023371.0, + "step": 5018 + }, + { + "epoch": 2.9786350148367955, + "grad_norm": 0.5804563760757446, + "learning_rate": 1e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7805254459381104, + "num_tokens": 2405519992.0, + "step": 5019 + }, + { + "epoch": 2.979228486646884, + "grad_norm": 0.5435382127761841, + "learning_rate": 1e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.7838354706764221, + "num_tokens": 2406009524.0, + "step": 5020 + }, + { + "epoch": 2.979821958456973, + "grad_norm": 0.5149233341217041, + "learning_rate": 1e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.7801600694656372, + "num_tokens": 2406534536.0, + "step": 5021 + }, + { + "epoch": 2.9804154302670622, + "grad_norm": 0.5360729694366455, + "learning_rate": 1e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.777450680732727, + "num_tokens": 2407053239.0, + "step": 5022 + }, + { + "epoch": 2.9810089020771513, + "grad_norm": 0.5575224757194519, + "learning_rate": 1e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7742500305175781, + "num_tokens": 2407522683.0, + "step": 5023 + }, + { + "epoch": 2.9816023738872404, + "grad_norm": 0.5494571328163147, + "learning_rate": 
1e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.7828465700149536, + "num_tokens": 2408015844.0, + "step": 5024 + }, + { + "epoch": 2.9821958456973294, + "grad_norm": 0.6172310709953308, + "learning_rate": 1e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7748701572418213, + "num_tokens": 2408406215.0, + "step": 5025 + }, + { + "epoch": 2.9827893175074185, + "grad_norm": 0.5355505347251892, + "learning_rate": 1e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7698711156845093, + "num_tokens": 2408909382.0, + "step": 5026 + }, + { + "epoch": 2.9833827893175076, + "grad_norm": 0.5692837238311768, + "learning_rate": 1e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7740247249603271, + "num_tokens": 2409350703.0, + "step": 5027 + }, + { + "epoch": 2.983976261127596, + "grad_norm": 0.5969734191894531, + "learning_rate": 1e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7741580009460449, + "num_tokens": 2409802500.0, + "step": 5028 + }, + { + "epoch": 2.9845697329376852, + "grad_norm": 0.535711407661438, + "learning_rate": 1e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.7837425470352173, + "num_tokens": 2410288706.0, + "step": 5029 + }, + { + "epoch": 2.9851632047477743, + "grad_norm": 0.5896586775779724, + "learning_rate": 1e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7755166292190552, + "num_tokens": 2410771751.0, + "step": 5030 + }, + { + "epoch": 2.9857566765578634, + "grad_norm": 0.5639784932136536, + "learning_rate": 1e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7725408673286438, + "num_tokens": 2411280702.0, + "step": 5031 + }, + { + "epoch": 2.9863501483679524, + "grad_norm": 0.5725341439247131, + "learning_rate": 1e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7722052931785583, + "num_tokens": 2411746111.0, + "step": 5032 + }, + { + "epoch": 2.9869436201780415, + "grad_norm": 0.5319268703460693, + "learning_rate": 1e-06, + "loss": 0.723, + "mean_token_accuracy": 0.768202543258667, + "num_tokens": 2412225960.0, + "step": 5033 + }, + 
{ + "epoch": 2.9875370919881306, + "grad_norm": 0.5382263660430908, + "learning_rate": 1e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7715917825698853, + "num_tokens": 2412752015.0, + "step": 5034 + }, + { + "epoch": 2.9881305637982196, + "grad_norm": 0.5606957077980042, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7642789483070374, + "num_tokens": 2413248750.0, + "step": 5035 + }, + { + "epoch": 2.9887240356083087, + "grad_norm": 0.551945686340332, + "learning_rate": 1e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7741006016731262, + "num_tokens": 2413730901.0, + "step": 5036 + }, + { + "epoch": 2.9893175074183977, + "grad_norm": 0.5624188780784607, + "learning_rate": 1e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7697665691375732, + "num_tokens": 2414214481.0, + "step": 5037 + }, + { + "epoch": 2.989910979228487, + "grad_norm": 0.5560750961303711, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7737922072410583, + "num_tokens": 2414689501.0, + "step": 5038 + }, + { + "epoch": 2.990504451038576, + "grad_norm": 0.5755695700645447, + "learning_rate": 1e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7734914422035217, + "num_tokens": 2415162397.0, + "step": 5039 + }, + { + "epoch": 2.991097922848665, + "grad_norm": 0.5619514584541321, + "learning_rate": 1e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7828934788703918, + "num_tokens": 2415609762.0, + "step": 5040 + }, + { + "epoch": 2.9916913946587536, + "grad_norm": 0.6415494084358215, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7558399438858032, + "num_tokens": 2416056168.0, + "step": 5041 + }, + { + "epoch": 2.9922848664688426, + "grad_norm": 0.6412658095359802, + "learning_rate": 1e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7748936414718628, + "num_tokens": 2416547348.0, + "step": 5042 + }, + { + "epoch": 2.9928783382789317, + "grad_norm": 0.5385465025901794, + "learning_rate": 1e-06, + "loss": 0.6955, + 
"mean_token_accuracy": 0.7774609327316284, + "num_tokens": 2417062091.0, + "step": 5043 + }, + { + "epoch": 2.9934718100890207, + "grad_norm": 0.5727181434631348, + "learning_rate": 1e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.796898603439331, + "num_tokens": 2417547554.0, + "step": 5044 + }, + { + "epoch": 2.99406528189911, + "grad_norm": 0.626861035823822, + "learning_rate": 1e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7722299098968506, + "num_tokens": 2418037285.0, + "step": 5045 + }, + { + "epoch": 2.994658753709199, + "grad_norm": 0.5427464842796326, + "learning_rate": 1e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7721253037452698, + "num_tokens": 2418521298.0, + "step": 5046 + }, + { + "epoch": 2.995252225519288, + "grad_norm": 0.5364473462104797, + "learning_rate": 1e-06, + "loss": 0.687, + "mean_token_accuracy": 0.778609573841095, + "num_tokens": 2419007444.0, + "step": 5047 + }, + { + "epoch": 2.995845697329377, + "grad_norm": 0.6092686057090759, + "learning_rate": 1e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.7698693871498108, + "num_tokens": 2419465347.0, + "step": 5048 + }, + { + "epoch": 2.9964391691394656, + "grad_norm": 0.5869236588478088, + "learning_rate": 1e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7731796503067017, + "num_tokens": 2419957940.0, + "step": 5049 + }, + { + "epoch": 2.9970326409495547, + "grad_norm": 0.5453228950500488, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.768701434135437, + "num_tokens": 2420485319.0, + "step": 5050 + }, + { + "epoch": 2.9976261127596437, + "grad_norm": 0.5523106455802917, + "learning_rate": 1e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7670307159423828, + "num_tokens": 2420940184.0, + "step": 5051 + }, + { + "epoch": 2.998219584569733, + "grad_norm": 0.5987246036529541, + "learning_rate": 1e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7721089124679565, + "num_tokens": 2421432418.0, + "step": 5052 + }, + { + "epoch": 2.998813056379822, + 
"grad_norm": 0.5337479710578918, + "learning_rate": 1e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7800449132919312, + "num_tokens": 2421923154.0, + "step": 5053 + }, + { + "epoch": 2.999406528189911, + "grad_norm": 0.5275169610977173, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7608256340026855, + "num_tokens": 2422436471.0, + "step": 5054 + }, + { + "epoch": 3.0, + "grad_norm": 0.5560516715049744, + "learning_rate": 1e-06, + "loss": 0.699, + "mean_token_accuracy": 0.775320291519165, + "num_tokens": 2422895275.0, + "step": 5055 + }, + { + "epoch": 3.0, + "step": 5055, + "total_flos": 1.0910187920114791e+20, + "train_loss": 0.7627655786175638, + "train_runtime": 75075.1021, + "train_samples_per_second": 10.77, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1, + "max_steps": 5055, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 506, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0910187920114791e+20, + "train_batch_size": 40, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..0507a83 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa97601f50b8bfc98a954e34be5827f3da225aa89e7f66b057530bcfa9f7604f +size 13265